diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index b46cc550c781..a102ce2e99a9 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1,6335 +1,6356 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 1300102 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef VN_OPEN_INVFS #define VN_OPEN_INVFS 0x0 #endif VFS_SMR_DECLARE; #if __FreeBSD_version < 1300103 #define NDFREE_PNBUF(ndp) NDFREE((ndp), NDF_ONLY_PNBUF) #endif #if __FreeBSD_version >= 1300047 #define vm_page_wire_lock(pp) #define vm_page_wire_unlock(pp) #else #define vm_page_wire_lock(pp) vm_page_lock(pp) #define vm_page_wire_unlock(pp) vm_page_unlock(pp) #endif #ifdef DEBUG_VFS_LOCKS #define VNCHECKREF(vp) \ VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \ ("%s: wrong ref counts", __func__)); #else #define VNCHECKREF(vp) #endif #if __FreeBSD_version >= 1400045 typedef uint64_t cookie_t; #else typedef ulong_t cookie_t; #endif /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr) { (void) cr; znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Keep a count of the synchronous opens in the znode */ if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) { (void) offset, (void) cr; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & O_SYNC) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } static int zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, int *rvalp) { (void) flag, (void) cred, (void) rvalp; loff_t off; int error; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ } case _FIOGDIO: case _FIOSDIO: { return (0); } case F_SEEK_DATA: case F_SEEK_HOLE: { off = *(offset_t *)data; /* offset parameter is in/out */ error = zfs_holey(VTOZ(vp), com, &off); if (error) return (error); *(offset_t *)data = off; return (0); } } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considered clean despite have some * dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked_12(obj); #if __FreeBSD_version < 1300050 for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } vm_page_sbusy(pp); } else if (pp != NULL) { ASSERT(!pp->valid); pp = NULL; } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } break; } #else vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } #endif return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); #if __FreeBSD_version >= 1300041 vm_object_pip_wakeup(pp->object); #else vm_object_pip_subtract(pp->object, 1); #endif } #if __FreeBSD_version > 1300051 static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t m; obj = vp->v_object; vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY); return (m); } #else static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_wire_lock(pp); vm_page_hold(pp); vm_page_wire_unlock(pp); } else pp = NULL; break; } return (pp); } #endif static void page_unhold(vm_page_t pp) { vm_page_wire_lock(pp); #if __FreeBSD_version >= 1300035 vm_page_unwire(pp, PQ_ACTIVE); #else vm_page_unhold(pp); #endif vm_page_wire_unlock(pp); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { vm_object_t obj; struct sf_buf *sf; vnode_t *vp = ZTOV(zp); caddr_t va; int off; ASSERT3P(vp->v_mount, !=, NULL); obj = vp->v_object; ASSERT3P(obj, !=, NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock_12(obj); #if __FreeBSD_version >= 1300041 vm_object_pip_add(obj, 1); #endif for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, zp->z_id, start + off, nbytes, va + off, DMU_READ_PREFETCH); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); page_unbusy(pp); } len -= nbytes; off = 0; } #if __FreeBSD_version >= 1300041 vm_object_pip_wakeup(obj); #else vm_object_pip_wakeupn(obj, 0); #endif zfs_vmobject_wunlock_12(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ int mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio) { vnode_t *vp = ZTOV(zp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int error = 0; ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY); ASSERT3P(vp->v_mount, !=, NULL); obj = vp->v_object; ASSERT3P(obj, !=, NULL); ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET); zfs_vmobject_wlock_12(obj); for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (vm_page_none_valid(pp)) { zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) memset(va + bytes, 0, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); #if __FreeBSD_version >= 1300081 if (error == 0) { vm_page_valid(pp); vm_page_activate(pp); vm_page_do_sunbusy(pp); } else { zfs_vmobject_wlock(obj); if (!vm_page_wired(pp) && pp->valid == 0 && vm_page_busy_tryupgrade(pp)) vm_page_free(pp); else vm_page_sunbusy(pp); zfs_vmobject_wunlock(obj); } #else vm_page_do_sunbusy(pp); vm_page_lock(pp); if (error) { if (pp->wire_count == 0 && pp->valid == 0 && !vm_page_busied(pp)) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); #endif } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_do_sunbusy(pp); } if (error) break; zfs_uio_advance(uio, bytes); len -= bytes; } zfs_vmobject_wunlock_12(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { vnode_t *vp = ZTOV(zp); vm_object_t obj; int64_t start; int len = nbytes; int off; int error = 0; ASSERT3P(vp->v_mount, !=, NULL); obj = vp->v_object; ASSERT3P(obj, !=, NULL); start = zfs_uio_offset(uio); off = start & PAGEOFFSET; zfs_vmobject_wlock_12(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if ((pp = page_hold(vp, start))) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); error = vn_io_fault_uiomove(va + off, bytes, GET_UIO_STRUCT(uio)); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); page_unhold(pp); } else { zfs_vmobject_wunlock_12(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock_12(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock_12(obj); return (error); } int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *presid) { int error = 0; ssize_t resid; error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos, UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread); if (error) { return (SET_ERROR(error)); } else if (presid == NULL) { if (resid != 0) { error = SET_ERROR(EIO); } } else { *presid = resid; } return (error); } void zfs_zrele_async(znode_t *zp) { vnode_t *vp = ZTOV(zp); objset_t *os = ITOZSB(vp)->z_os; VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os))); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs; int error; int ltype; if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(dvp, __func__); if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (VN_IS_DOOMED(dvp)) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. * A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ static int zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, int flags, boolean_t cached) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; #if __FreeBSD_version > 1300124 seqc_t dvp_seqc; #endif int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, const char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); #if __FreeBSD_version > 1300124 dvp_seqc = vn_seqc_read_notmodify(dvp); #endif *vpp = NULL; if (flags & LOOKUP_XATTR) { /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOPNOTSUPP)); } /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) { ZFS_EXIT(zfsvfs); return (error); } *vpp = ZTOV(zp); /* * Do we have permission to get into attribute directory? */ error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr); if (error) { vrele(ZTOV(zp)); } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory if we're not coming in via * VOP_CACHEDLOOKUP. */ if (!cached) { #ifdef NOEXECCHECK if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; } else #endif if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; ZFS_EXIT(zfsvfs); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK1(dvp); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { ZFS_EXIT(zfsvfs); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop is retry the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; +#if __FreeBSD_version < 1400068 cnp->cn_flags |= SAVENAME; +#endif break; } zfs_fallthrough; case DELETE: +#if __FreeBSD_version < 1400068 if (error == 0) cnp->cn_flags |= SAVENAME; +#endif break; } } #if __FreeBSD_version > 1300124 if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * FIXME: zfs_lookup_lock relocks vnodes and does nothing to * handle races. In particular different callers may end up * with different vnodes and will try to add conflicting * entries to the namecache. * * While finding different result may be acceptable in face * of concurrent modification, adding conflicting entries * trips over an assert in the namecache. * * Ultimately let an entry through once everything settles. */ if (!vn_seqc_consistent(dvp, dvp_seqc)) { cnp->cn_flags &= ~MAKEENTRY; } } #endif /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ int zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) { (void) excl, (void) mode, (void) flag; znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; uid_t uid = crgetuid(cr); gid_t gid = crgetgid(cr); uint64_t projid = ZFS_DEFAULT_PROJID; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype; #ifdef DEBUG_VFS_LOCKS vnode_t *dvp = ZTOV(dzp); #endif /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *zpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve_(); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: VNCHECKREF(dvp); if (error == 0) { *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ static int zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zp = VTOZ(vp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; xattr_obj = 0; xzp = NULL; if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } /* XXX check changes to linux vnops */ txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp, struct componentname *cnp, int nameiop) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int error; cnp->cn_nameptr = __DECONST(char *, name); cnp->cn_namelen = strlen(name); cnp->cn_nameiop = nameiop; - cnp->cn_flags = ISLASTCN | SAVENAME; + cnp->cn_flags = ISLASTCN; +#if __FreeBSD_version < 1400068 + cnp->cn_flags |= SAVENAME; +#endif cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cnp->cn_cred = kcred; #if __FreeBSD_version < 1400037 cnp->cn_thread = curthread; #endif if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) { struct vop_lookup_args a; a.a_gen.a_desc = &vop_lookup_desc; a.a_dvp = ZTOV(dzp); a.a_vpp = vpp; a.a_cnp = cnp; error = vfs_cache_lookup(&a); } else { error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, 0, B_FALSE); } #ifdef ZFS_DEBUG if (error) { printf("got error %d on name %s on op %d\n", error, name, nameiop); kdb_backtrace(); } #endif return (error); } int zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags) { vnode_t *vp; int error; struct componentname cn; if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) return (error); error = zfs_remove_(ZTOV(dzp), vp, name, cr); vput(vp); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ int zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp) { (void) flags, (void) vsecp; znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; uid_t uid = crgetuid(cr); gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT3U(vap->va_type, ==, VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *zpp = NULL; if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } #if __FreeBSD_version < 1300124 static void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) { cache_purge(dvp); cache_purge(vp); } #endif /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ static int zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); cache_vop_rmdir(dvp, vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } int zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags) { struct componentname cn; vnode_t *vp; int error; if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) return (error); error = zfs_rmdir_(ZTOV(dzp), vp, name, cr); vput(vp); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ static int zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, int *ncookies, cookie_t **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; cookie_t *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = zfs_uio_offset(uio); prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = GET_UIO_STRUCT(uio)->uio_iov; bytes_wanted = iovp->iov_len; if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) - sizeof (((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof (*cooks), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry */ next = &(eodp->ed_off); eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); /* NOTE: d_off is the offset for the *next* entry. */ next = &odp->d_off; strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; dirent_terminate(odp); odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT3S(outcount, <=, bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } /* Fill the offset right after advancing the cursor. */ if (next != NULL) *next = offset; if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; zfs_uio_resid(uio) -= outcount; } else if ((error = zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) { /* * Reset the pointer. */ offset = zfs_uio_offset(uio); } update: zap_cursor_fini(&zc); if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_uio_setoffset(uio, offset); ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr))) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; vn_fsid(vp, vap); vap->va_nodeid = zp->z_id; vap->va_nlink = zp->z_links; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && zp->z_links < ZFS_LINK_MAX) vap->va_nlink++; vap->va_size = zp->z_size; if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); vap->va_gen = zp->z_gen; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { xoap->xoa_projinherit = ((zp->z_pflags & ZFS_PROJINHERIT) != 0); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { xoap->xoa_projid = zp->z_projid; XVA_SET_RTN(xvap, XAT_PROJID); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) { vnode_t *vp = ZTOV(zp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Note: ZFS_READONLY is handled in zfs_zaccess_common. */ /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } if (xoap != NULL && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOPNOTSUPP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOPNOTSUPP)); } } attrzp = NULL; aclp = NULL; if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). */ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } tx = dmu_tx_create(os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT0(err); if (attrzp) { vn_seqc_write_begin(ZTOV(attrzp)); err = zfs_acl_chown_setattr(attrzp); vn_seqc_write_end(ZTOV(attrzp)); ASSERT0(err); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT3S(vp->v_type, ==, VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT0(err2); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * Look up the directory entries corresponding to the source and target * directory/name pairs. */ static int zfs_rename_relock_lookup(znode_t *sdzp, const struct componentname *scnp, znode_t **szpp, znode_t *tdzp, const struct componentname *tcnp, znode_t **tzpp) { zfsvfs_t *zfsvfs; znode_t *szp, *tzp; int error; /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(sdzp); ZFS_VERIFY_ZP(tdzp); /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. */ error = zfs_dirent_lookup(sdzp, scnp->cn_nameptr, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } *szpp = szp; /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tcnp->cn_nameptr, &tzp, 0); if (error != 0) { vrele(ZTOV(szp)); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } *tzpp = tzp; out: ZFS_EXIT(zfsvfs); return (error); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; int error; VOP_UNLOCK1(tdvp); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK1(*tvpp); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK1(sdvp); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK1(tdvp); goto relock; } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); error = zfs_rename_relock_lookup(sdzp, scnp, &szp, tdzp, tcnp, &tzp); if (error != 0) { VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); goto out; } svp = ZTOV(szp); tvp = tzp != NULL ? ZTOV(tzp) : NULL; /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK1(nvp); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); VOP_UNLOCK1(*svpp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } #if __FreeBSD_version < 1300124 static void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) { cache_purge(fvp); if (tvp != NULL) cache_purge(tvp); cache_purge_negative(tdvp); } #endif static int zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr); /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * scnp - Old entry name. * tdvp - Target directory to contain the "new entry". * tcnp - New entry name. * cr - credentials of caller. * INOUT: svpp - Source file * tvpp - Target file, may point to NULL initially * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ static int zfs_do_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { int error; ASSERT_VOP_ELOCKED(tdvp, __func__); if (*tvpp != NULL) ASSERT_VOP_ELOCKED(*tvpp, __func__); /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } error = zfs_do_rename_impl(sdvp, svpp, scnp, tdvp, tvpp, tcnp, cr); VOP_UNLOCK1(sdvp); VOP_UNLOCK1(*svpp); out: if (*tvpp != NULL) VOP_UNLOCK1(*tvpp); if (tdvp != *tvpp) VOP_UNLOCK1(tdvp); return (error); } static int zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { dmu_tx_t *tx; zfsvfs_t *zfsvfs; zilog_t *zilog; znode_t *tdzp, *sdzp, *tzp, *szp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(tdzp); ZFS_VERIFY_ZP(sdzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto out; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto out; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto out; } szp = VTOZ(*svpp); ZFS_VERIFY_ZP(szp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (tzp != NULL) ZFS_VERIFY_ZP(tzp); /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto out; } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) goto out; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto out; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_check(szp, sdzp, tdzp))) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto out; } } } vn_seqc_write_begin(*svpp); vn_seqc_write_begin(sdvp); if (*tvpp != NULL) vn_seqc_write_begin(*tvpp); if (tdvp != *tvpp) vn_seqc_write_begin(tdvp); vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out_seq; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL)); } } if (error == 0) { cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp); } } dmu_tx_commit(tx); out_seq: vn_seqc_write_end(*svpp); vn_seqc_write_end(sdvp); if (*tvpp != NULL) vn_seqc_write_end(*tvpp); if (tdvp != *tvpp) vn_seqc_write_end(tdvp); out: if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } int zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, cred_t *cr, int flags) { struct componentname scn, tcn; vnode_t *sdvp, *tdvp; vnode_t *svp, *tvp; int error; svp = tvp = NULL; sdvp = ZTOV(sdzp); tdvp = ZTOV(tdzp); error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE); if (sdzp->z_zfsvfs->z_replay == B_FALSE) VOP_UNLOCK1(sdvp); if (error != 0) goto fail; VOP_UNLOCK1(svp); vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME); if (error == EJUSTRETURN) tvp = NULL; else if (error != 0) { VOP_UNLOCK1(tdvp); goto fail; } error = zfs_do_rename(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr); fail: if (svp != NULL) vrele(svp); if (tvp != NULL) vrele(tvp); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, const char *link, znode_t **zpp, cred_t *cr, int flags) { (void) flags; znode_t *zp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; ASSERT3S(vap->va_type, ==, VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0 /* projid */)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datasets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), __DECONST(void *, link), len, tx); else zfs_sa_symlink(zp, __DECONST(char *, link), len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dzp, name, zp, tx, ZNEW); zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *zpp = zp; zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ static int zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct) { (void) cr, (void) ct; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ int zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, int flags) { (void) flags; znode_t *tzp; zfsvfs_t *zfsvfs = tdzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(tdzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (ZTOV(szp)->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } ZFS_VERIFY_ZP(szp); /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(tdzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(ZTOV(szp), ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: ip - inode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * ip - ctime|mtime updated */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { (void) offset; zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (cmd != F_FREESP) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); ZFS_EXIT(zfsvfs); return (error); } static void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { (void) cr, (void) ct; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); } _Static_assert(sizeof (struct zfid_short) <= sizeof (struct fid), "struct zfid_short bigger than struct fid"); _Static_assert(sizeof (struct zfid_long) <= sizeof (struct fid), "struct zfid_long bigger than struct fid"); static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { (void) ct; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; fidp->fid_len = size; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp; zfsvfs_t *zfsvfs; switch (cmd) { case _PC_LINK_MAX: *valp = MIN(LONG_MAX, ZFS_LINK_MAX); return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); case _PC_ACL_EXTENDED: #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0; ZFS_EXIT(zfsvfs); #else *valp = 0; #endif return (0); case _PC_ACL_NFS4: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0; ZFS_EXIT(zfsvfs); return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; vm_object_t object; off_t start, end, obj_size; uint_t blksz; int pgsin_b, pgsin_a; int error; ZFS_ENTER_ERROR(zfsvfs, zfs_vm_pagerret_error); ZFS_VERIFY_ZP_ERROR(zp, zfs_vm_pagerret_error); start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); /* * Lock a range covering all required and optional pages. * Note that we need to handle the case of the block size growing. */ for (;;) { blksz = zp->z_blksz; lr = zfs_rangelock_tryenter(&zp->z_rangelock, rounddown(start, blksz), roundup(end, blksz) - rounddown(start, blksz), RL_READER); if (lr == NULL) { if (rahead != NULL) { *rahead = 0; rahead = NULL; } if (rbehind != NULL) { *rbehind = 0; rbehind = NULL; } break; } if (blksz == zp->z_blksz) break; zfs_rangelock_exit(lr); } object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { if (lr != NULL) zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } pgsin_b = 0; if (rbehind != NULL) { pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); pgsin_b = MIN(*rbehind, pgsin_b); } pgsin_a = 0; if (rahead != NULL) { pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); if (end + IDX_TO_OFF(pgsin_a) >= obj_size) pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); pgsin_a = MIN(*rahead, pgsin_a); } /* * NB: we need to pass the exact byte size of the data that we expect * to read after accounting for the file size. This is required because * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. */ error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); if (lr != NULL) zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE); ZFS_EXIT(zfsvfs); if (error != 0) return (zfs_vm_pagerret_error); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); if (rbehind != NULL) *rbehind = pgsin_b; if (rahead != NULL) *rahead = pgsin_a; return (zfs_vm_pagerret_ok); } #ifndef _SYS_SYSPROTO_H_ struct vop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; }; #endif static int zfs_freebsd_getpages(struct vop_getpages_args *ap) { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; object = vp->v_object; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); pcount = btoc(len); ncount = pcount; for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; ZFS_ENTER_ERROR(zfsvfs, zfs_vm_pagerret_error); ZFS_VERIFY_ZP_ERROR(zp, zfs_vm_pagerret_error); off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); /* * XXX we should be passing a callback to undirty * but that would make the locking messier */ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0, NULL, NULL); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: zfs_rangelock_exit(lr); if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len); ZFS_EXIT(zfsvfs); return (rtvals[0]); } #ifndef _SYS_SYSPROTO_H_ struct vop_putpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; }; #endif static int zfs_freebsd_putpages(struct vop_putpages_args *ap) { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } #ifndef _SYS_SYSPROTO_H_ struct vop_bmap_args { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; }; #endif static int zfs_freebsd_bmap(struct vop_bmap_args *ap) { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_open(struct vop_open_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_close(struct vop_close_args *ap) { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_ioctl_args { struct vnode *a_vp; ulong_t a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_ioctl(struct vop_ioctl_args *ap) { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL)); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= O_APPEND; if (ioflags & IO_NDELAY) flags |= O_NONBLOCK; if (ioflags & IO_SYNC) flags |= O_SYNC; return (flags); } #ifndef _SYS_SYSPROTO_H_ struct vop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif static int zfs_freebsd_read(struct vop_read_args *ap) { zfs_uio_t uio; zfs_uio_init(&uio, ap->a_uio); return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), ap->a_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif static int zfs_freebsd_write(struct vop_write_args *ap) { zfs_uio_t uio; zfs_uio_init(&uio, ap->a_uio); return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), ap->a_cred)); } #if __FreeBSD_version >= 1300102 /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ static int zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v) { vnode_t *vp; znode_t *zp; uint64_t pflags; vp = v->a_vp; zp = VTOZ_SMR(vp); if (__predict_false(zp == NULL)) return (EAGAIN); pflags = atomic_load_64(&zp->z_pflags); if (pflags & ZFS_AV_QUARANTINED) return (EAGAIN); if (pflags & ZFS_XATTR) return (EAGAIN); if ((pflags & ZFS_NO_EXECS_DENIED) == 0) return (EAGAIN); return (0); } #endif #if __FreeBSD_version >= 1300139 static int zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v) { vnode_t *vp; znode_t *zp; char *target; vp = v->a_vp; zp = VTOZ_SMR(vp); if (__predict_false(zp == NULL)) { return (EAGAIN); } target = atomic_load_consume_ptr(&zp->z_cached_symlink); if (target == NULL) { return (EAGAIN); } return (cache_symlink_resolve(v->a_fpl, target, strlen(target))); } #endif #ifndef _SYS_SYSPROTO_H_ struct vop_access_args { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_access(struct vop_access_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; if (ap->a_accmode == VEXEC) { if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0) return (0); } /* * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(zp, accmode, 0, ap->a_cred); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { #if __FreeBSD_version >= 1300105 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred); #else error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); #endif } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. */ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_lookup_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT3U(cnp->cn_namelen, <, sizeof (nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, 0, cached)); } static int zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) { return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); } #ifndef _SYS_SYSPROTO_H_ struct vop_lookup_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif static int zfs_cache_lookup(struct vop_lookup_args *ap) { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap, B_FALSE)); } #ifndef _SYS_SYSPROTO_H_ struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif static int zfs_freebsd_create(struct vop_create_args *ap) { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc, mode; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; *ap->a_vpp = NULL; rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode, &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */); if (rc == 0) *ap->a_vpp = ZTOV(zp); if (zfsvfs->z_use_namecache && rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_remove(struct vop_remove_args *ap) { +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif static int zfs_freebsd_mkdir(struct vop_mkdir_args *ap) { vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); *ap->a_vpp = NULL; rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp, ap->a_cnp->cn_cred, 0, NULL); if (rc == 0) *ap->a_vpp = ZTOV(zp); return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_rmdir(struct vop_rmdir_args *ap) { struct componentname *cnp = ap->a_cnp; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; cookie_t **a_cookies; }; #endif static int zfs_freebsd_readdir(struct vop_readdir_args *ap) { zfs_uio_t uio; zfs_uio_init(&uio, ap->a_uio); return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } #ifndef _SYS_SYSPROTO_H_ struct vop_fsync_args { struct vnode *a_vp; int a_waitfor; struct thread *a_td; }; #endif static int zfs_freebsd_fsync(struct vop_fsync_args *ap) { vop_stdfsync(ap); return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif static int zfs_freebsd_getattr(struct vop_getattr_args *ap) { vattr_t *vap = ap->a_vap; xvattr_t xvap; ulong_t fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif static int zfs_freebsd_setattr(struct vop_setattr_args *ap) { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; ulong_t fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. */ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on * objects they have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } if (vap->va_birthtime.tv_sec != VNOVAL) { xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; #endif static int zfs_freebsd_rename(struct vop_rename_args *ap) { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); +#endif error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; #endif static int zfs_freebsd_symlink(struct vop_symlink_args *ap) { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; znode_t *zp = NULL; #if __FreeBSD_version >= 1300139 char *symlink; size_t symlink_len; #endif int rc; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); *ap->a_vpp = NULL; rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, ap->a_target, &zp, cnp->cn_cred, 0 /* flags */); if (rc == 0) { *ap->a_vpp = ZTOV(zp); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); #if __FreeBSD_version >= 1300139 MPASS(zp->z_cached_symlink == NULL); symlink_len = strlen(ap->a_target); symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK); if (symlink != NULL) { memcpy(symlink, ap->a_target, symlink_len); symlink[symlink_len] = '\0'; atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)symlink); } #endif } return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; #endif static int zfs_freebsd_readlink(struct vop_readlink_args *ap) { zfs_uio_t uio; int error; #if __FreeBSD_version >= 1300139 znode_t *zp = VTOZ(ap->a_vp); char *symlink, *base; size_t symlink_len; bool trycache; #endif zfs_uio_init(&uio, ap->a_uio); #if __FreeBSD_version >= 1300139 trycache = false; if (zfs_uio_segflg(&uio) == UIO_SYSSPACE && zfs_uio_iovcnt(&uio) == 1) { base = zfs_uio_iovbase(&uio, 0); symlink_len = zfs_uio_iovlen(&uio, 0); trycache = true; } #endif error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL); #if __FreeBSD_version >= 1300139 if (atomic_load_ptr(&zp->z_cached_symlink) != NULL || error != 0 || !trycache) { return (error); } symlink_len -= zfs_uio_resid(&uio); symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK); if (symlink != NULL) { memcpy(symlink, base, symlink_len); symlink[symlink_len] = '\0'; if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)NULL, (uintptr_t)symlink)) { cache_symlink_free(symlink, symlink_len + 1); } } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_link(struct vop_link_args *ap) { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_link(VTOZ(tdvp), VTOZ(vp), cnp->cn_nameptr, cnp->cn_cred, 0)); } #ifndef _SYS_SYSPROTO_H_ struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_inactive(struct vop_inactive_args *ap) { vnode_t *vp = ap->a_vp; #if __FreeBSD_version >= 1300123 zfs_inactive(vp, curthread->td_ucred, NULL); #else zfs_inactive(vp, ap->a_td->td_ucred, NULL); #endif return (0); } #if __FreeBSD_version >= 1300042 #ifndef _SYS_SYSPROTO_H_ struct vop_need_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int need; if (vn_need_pageq_flush(vp)) return (1); if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs)) return (1); need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); return (need); } #endif #ifndef _SYS_SYSPROTO_H_ struct vop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_reclaim(struct vop_reclaim_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT3P(zp, !=, NULL); #if __FreeBSD_version < 1300042 /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); #endif /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vp->v_data = NULL; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_fid_args { struct vnode *a_vp; struct fid *a_fid; }; #endif static int zfs_freebsd_fid(struct vop_fid_args *ap) { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct vop_pathconf_args { struct vnode *a_vp; int a_name; register_t *a_retval; } *ap; #endif static int zfs_freebsd_pathconf(struct vop_pathconf_args *ap) { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) { *ap->a_retval = val; return (error); } if (error != EOPNOTSUPP) return (error); switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); #if __FreeBSD_version >= 1400032 case _PC_DEALLOC_PRESENT: *ap->a_retval = 1; return (0); #endif case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { *ap->a_retval = PIPE_BUF; return (0); } return (EINVAL); default: return (vop_stdpathconf(ap)); } } static int zfs_xattr_compat = 1; static int zfs_check_attrname(const char *name) { /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (SET_ERROR(EINVAL)); /* We don't allow attribute names that start with a namespace prefix. */ if (ZFS_XA_NS_PREFIX_FORBIDDEN(name)) return (SET_ERROR(EINVAL)); return (0); } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE XATTR_COMPAT PREFIX * system * freebsd:system: * user 1 (none, can be used to access ZFS * fsattr(5) attributes created on Solaris) * user 0 user. */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size, boolean_t compat) { const char *namespace, *prefix, *suffix; memset(attrname, 0, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: if (compat) { /* * This is the default namespace by which we can access * all attributes created on Solaris. */ prefix = namespace = suffix = ""; } else { /* * This is compatible with the user namespace encoding * on Linux prior to xattr_compat, but nothing * else. */ prefix = ""; namespace = "user"; suffix = "."; } break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (SET_ERROR(EINVAL)); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (SET_ERROR(ENAMETOOLONG)); } return (0); } static int zfs_ensure_xattr_cached(znode_t *zp) { int error = 0; ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); if (zp->z_xattr_cached != NULL) return (0); if (rw_write_held(&zp->z_xattr_lock)) return (zfs_sa_get_xattr(zp)); if (!rw_tryupgrade(&zp->z_xattr_lock)) { rw_exit(&zp->z_xattr_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); } if (zp->z_xattr_cached == NULL) error = zfs_sa_get_xattr(zp); rw_downgrade(&zp->z_xattr_lock); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname) { struct thread *td = ap->a_td; struct nameidata nd; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR, B_FALSE); if (error != 0) return (error); flags = FREAD; #if __FreeBSD_version < 1400043 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); #else NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); #endif error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE_PNBUF(&nd); if (error != 0) return (SET_ERROR(error)); if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK1(vp); vn_close(vp, flags, ap->a_cred, td); return (error); } static int zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname) { znode_t *zp = VTOZ(ap->a_vp); uchar_t *nv_value; uint_t nv_size; int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname, &nv_value, &nv_size); if (error != 0) return (SET_ERROR(error)); if (ap->a_size != NULL) *ap->a_size = nv_size; else if (ap->a_uio != NULL) error = uiomove(nv_value, nv_size, ap->a_uio); if (error != 0) return (SET_ERROR(error)); return (0); } static int zfs_getextattr_impl(struct vop_getextattr_args *ap, boolean_t compat) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrname[EXTATTR_MAXNAMELEN+1]; int error; error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname), compat); if (error != 0) return (error); error = ENOENT; if (zfsvfs->z_use_sa && zp->z_is_sa) error = zfs_getextattr_sa(ap, attrname); if (error == ENOENT) error = zfs_getextattr_dir(ap, attrname); return (error); } /* * Vnode operation to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (SET_ERROR(error)); error = zfs_check_attrname(ap->a_name); if (error != 0) return (error); error = ENOENT; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_READER); error = zfs_getextattr_impl(ap, zfs_xattr_compat); if ((error == ENOENT || error == ENOATTR) && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) { /* * Fall back to the alternate namespace format if we failed to * find a user xattr. */ error = zfs_getextattr_impl(ap, !zfs_xattr_compat); } rw_exit(&zp->z_xattr_lock); ZFS_EXIT(zfsvfs); if (error == ENOENT) error = SET_ERROR(ENOATTR); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname) { struct nameidata nd; vnode_t *xvp = NULL, *vp; int error; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR, B_FALSE); if (error != 0) return (error); #if __FreeBSD_version < 1400043 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, ap->a_td); #else NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp); #endif error = namei(&nd); vp = nd.ni_vp; if (error != 0) { NDFREE_PNBUF(&nd); return (SET_ERROR(error)); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); return (error); } static int zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname) { znode_t *zp = VTOZ(ap->a_vp); nvlist_t *nvl; int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); nvl = zp->z_xattr_cached; error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY); if (error != 0) error = SET_ERROR(error); else error = zfs_sa_set_xattr(zp, attrname, NULL, 0); if (error != 0) { zp->z_xattr_cached = NULL; nvlist_free(nvl); } return (error); } static int zfs_deleteextattr_impl(struct vop_deleteextattr_args *ap, boolean_t compat) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrname[EXTATTR_MAXNAMELEN+1]; int error; error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname), compat); if (error != 0) return (error); error = ENOENT; if (zfsvfs->z_use_sa && zp->z_is_sa) error = zfs_deleteextattr_sa(ap, attrname); if (error == ENOENT) error = zfs_deleteextattr_dir(ap, attrname); return (error); } /* * Vnode operation to remove a named attribute. */ static int zfs_deleteextattr(struct vop_deleteextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (SET_ERROR(error)); error = zfs_check_attrname(ap->a_name); if (error != 0) return (error); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_WRITER); error = zfs_deleteextattr_impl(ap, zfs_xattr_compat); if ((error == ENOENT || error == ENOATTR) && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) { /* * Fall back to the alternate namespace format if we failed to * find a user xattr. */ error = zfs_deleteextattr_impl(ap, !zfs_xattr_compat); } rw_exit(&zp->z_xattr_lock); ZFS_EXIT(zfsvfs); if (error == ENOENT) error = SET_ERROR(ENOATTR); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname) { struct thread *td = ap->a_td; struct nameidata nd; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); if (error != 0) return (error); flags = FFLAGS(O_WRONLY | O_CREAT); #if __FreeBSD_version < 1400043 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); #else NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); #endif error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE_PNBUF(&nd); if (error != 0) return (SET_ERROR(error)); VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK1(vp); vn_close(vp, flags, ap->a_cred, td); return (error); } static int zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname) { znode_t *zp = VTOZ(ap->a_vp); nvlist_t *nvl; size_t sa_size; int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); nvl = zp->z_xattr_cached; size_t entry_size = ap->a_uio->uio_resid; if (entry_size > DXATTR_MAX_ENTRY_SIZE) return (SET_ERROR(EFBIG)); error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); if (error != 0) return (SET_ERROR(error)); if (sa_size > DXATTR_MAX_SA_SIZE) return (SET_ERROR(EFBIG)); uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP); error = uiomove(buf, entry_size, ap->a_uio); if (error != 0) { error = SET_ERROR(error); } else { error = nvlist_add_byte_array(nvl, attrname, buf, entry_size); if (error != 0) error = SET_ERROR(error); } if (error == 0) error = zfs_sa_set_xattr(zp, attrname, buf, entry_size); kmem_free(buf, entry_size); if (error != 0) { zp->z_xattr_cached = NULL; nvlist_free(nvl); } return (error); } static int zfs_setextattr_impl(struct vop_setextattr_args *ap, boolean_t compat) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrname[EXTATTR_MAXNAMELEN+1]; int error; error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname), compat); if (error != 0) return (error); struct vop_deleteextattr_args vda = { .a_vp = ap->a_vp, .a_attrnamespace = ap->a_attrnamespace, .a_name = ap->a_name, .a_cred = ap->a_cred, .a_td = ap->a_td, }; error = ENOENT; if (zfsvfs->z_use_sa && zp->z_is_sa && zfsvfs->z_xattr_sa) { error = zfs_setextattr_sa(ap, attrname); if (error == 0) { /* * Successfully put into SA, we need to clear the one * in dir if present. */ zfs_deleteextattr_dir(&vda, attrname); } } if (error != 0) { error = zfs_setextattr_dir(ap, attrname); if (error == 0 && zp->z_is_sa) { /* * Successfully put into dir, we need to clear the one * in SA if present. */ zfs_deleteextattr_sa(&vda, attrname); } } if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) { /* * Also clear all versions of the alternate compat name. */ zfs_deleteextattr_impl(&vda, !compat); } return (error); } /* * Vnode operation to set a named attribute. */ static int zfs_setextattr(struct vop_setextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (SET_ERROR(error)); error = zfs_check_attrname(ap->a_name); if (error != 0) return (error); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_WRITER); error = zfs_setextattr_impl(ap, zfs_xattr_compat); rw_exit(&zp->z_xattr_lock); ZFS_EXIT(zfsvfs); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix) { struct thread *td = ap->a_td; struct nameidata nd; uint8_t dirbuf[sizeof (struct dirent)]; struct iovec aiov; struct uio auio; vnode_t *xvp = NULL, *vp; int error, eof; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR, B_FALSE); if (error != 0) { /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } #if __FreeBSD_version < 1400043 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); #else NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp); #endif error = namei(&nd); vp = nd.ni_vp; NDFREE_PNBUF(&nd); if (error != 0) return (SET_ERROR(error)); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; size_t plen = strlen(attrprefix); do { aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof (dirbuf); auio.uio_resid = sizeof (dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); if (error != 0) break; int done = sizeof (dirbuf) - auio.uio_resid; for (int pos = 0; pos < done; ) { struct dirent *dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. */ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; else if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name)) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; uint8_t nlen = dp->d_namlen - plen; if (ap->a_size != NULL) { *ap->a_size += 1 + nlen; } else if (ap->a_uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, ap->a_uio); if (error == 0) { char *namep = dp->d_name + plen; error = uiomove(namep, nlen, ap->a_uio); } if (error != 0) { error = SET_ERROR(error); break; } } } } while (!eof && error == 0); vput(vp); return (error); } static int zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix) { znode_t *zp = VTOZ(ap->a_vp); int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); size_t plen = strlen(attrprefix); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); const char *name = nvpair_name(nvp); if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(name)) continue; else if (strncmp(name, attrprefix, plen) != 0) continue; uint8_t nlen = strlen(name) - plen; if (ap->a_size != NULL) { *ap->a_size += 1 + nlen; } else if (ap->a_uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, ap->a_uio); if (error == 0) { char *namep = __DECONST(char *, name) + plen; error = uiomove(namep, nlen, ap->a_uio); } if (error != 0) { error = SET_ERROR(error); break; } } } return (error); } static int zfs_listextattr_impl(struct vop_listextattr_args *ap, boolean_t compat) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrprefix[16]; int error; error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof (attrprefix), compat); if (error != 0) return (error); if (zfsvfs->z_use_sa && zp->z_is_sa) error = zfs_listextattr_sa(ap, attrprefix); if (error == 0) error = zfs_listextattr_dir(ap, attrprefix); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; if (ap->a_size != NULL) *ap->a_size = 0; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (SET_ERROR(error)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_READER); error = zfs_listextattr_impl(ap, zfs_xattr_compat); if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) { /* Also list user xattrs with the alternate format. */ error = zfs_listextattr_impl(ap, !zfs_xattr_compat); } rw_exit(&zp->z_xattr_lock); ZFS_EXIT(zfsvfs); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_getacl_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_getacl(struct vop_getacl_args *ap) { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if ((error = zfs_getsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred))) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_setacl_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_setacl(struct vop_setacl_args *ap) { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred); kmem_free(aaclp, aclbsize); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_aclcheck_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap) { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode. */ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; memcpy(ap->a_buf + *ap->a_buflen, name, len); *ap->a_vpp = ZTOV(dzp); } ZFS_EXIT(zfsvfs); return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; #if __FreeBSD_version >= 1300045 enum vgetstate vs = vget_prep(covered_vp); #else vhold(covered_vp); #endif ltype = VOP_ISLOCKED(vp); VOP_UNLOCK1(vp); #if __FreeBSD_version >= 1300045 error = vget_finish(covered_vp, LK_SHARED, vs); #else error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); #endif if (error == 0) { #if __FreeBSD_version >= 1300123 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf, ap->a_buflen); #else error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, ap->a_buf, ap->a_buflen); #endif vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if (VN_IS_DOOMED(vp)) error = SET_ERROR(ENOENT); return (error); } #if __FreeBSD_version >= 1400032 static int zfs_deallocate(struct vop_deallocate_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; off_t off, len, file_sz; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } zilog = zfsvfs->z_log; off = *ap->a_offset; len = *ap->a_len; file_sz = zp->z_size; if (off + len > file_sz) len = file_sz - off; /* Fast path for out-of-range request. */ if (len <= 0) { *ap->a_len = 0; ZFS_EXIT(zfsvfs); return (0); } error = zfs_freesp(zp, off, len, O_RDWR, TRUE); if (error == 0) { if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS || (ap->a_ioflag & IO_SYNC) != 0) zil_commit(zilog, zp->z_id); *ap->a_offset = off + len; *ap->a_len = 0; } ZFS_EXIT(zfsvfs); return (error); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, #if __FreeBSD_version >= 1300042 .vop_need_inactive = zfs_freebsd_need_inactive, #endif .vop_reclaim = zfs_freebsd_reclaim, #if __FreeBSD_version >= 1300102 .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, #endif #if __FreeBSD_version >= 1300139 .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, #endif .vop_access = zfs_freebsd_access, .vop_allocate = VOP_EINVAL, #if __FreeBSD_version >= 1400032 .vop_deallocate = zfs_deallocate, #endif .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_cachedlookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = (vop_mknod_t *)zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #if __FreeBSD_version >= 1300064 .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, #endif #if __FreeBSD_version >= 1400043 .vop_add_writecount = vop_stdadd_writecount_nomsync, #endif }; VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, #if __FreeBSD_version >= 1300102 .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, #endif #if __FreeBSD_version >= 1300139 .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, #endif .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, #if __FreeBSD_version >= 1400043 .vop_add_writecount = vop_stdadd_writecount_nomsync, #endif }; VFS_VOP_VECTOR_REGISTER(zfs_fifoops); /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, #if __FreeBSD_version >= 1300121 .vop_fplookup_vexec = VOP_EAGAIN, #endif #if __FreeBSD_version >= 1300139 .vop_fplookup_symlink = VOP_EAGAIN, #endif .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = zfs_freebsd_pathconf, #if __FreeBSD_version >= 1400043 .vop_add_writecount = vop_stdadd_writecount_nomsync, #endif }; VFS_VOP_VECTOR_REGISTER(zfs_shareops); ZFS_MODULE_PARAM(zfs, zfs_, xattr_compat, INT, ZMOD_RW, "Use legacy ZFS xattr naming for writing new user namespace xattrs"); diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index 13619d318cfc..511430ccdd97 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -1,2112 +1,2111 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; static struct fileops devfs_ops_f; #include #include #include #include #include #include static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct mtx cdevpriv_mtx; MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF); SYSCTL_DECL(_vfs_devfs); static int devfs_dotimes; SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW, &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision"); /* * Update devfs node timestamp. Note that updates are unlocked and * stat(2) could see partially updated times. */ static void devfs_timestamp(struct timespec *tsp) { time_t ts; if (devfs_dotimes) { vfs_timestamp(tsp); } else { ts = time_second; if (tsp->tv_sec != ts) { tsp->tv_sec = ts; tsp->tv_nsec = 0; } } } static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp, int *ref) { *dswp = devvn_refthread(fp->f_vnode, devp, ref); if (*dswp == NULL || *devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp, *ref); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); curthread->td_fpop = fp; return (0); } int devfs_get_cdevpriv(void **datap) { struct file *fp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (EBADF); p = fp->f_cdevpriv; if (p != NULL) { error = 0; *datap = p->cdpd_data; } else error = ENOENT; return (error); } int devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr) { struct file *fp; struct cdev_priv *cdp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (ENOENT); cdp = cdev2priv((struct cdev *)fp->f_data); p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK); p->cdpd_data = priv; p->cdpd_dtr = priv_dtr; p->cdpd_fp = fp; mtx_lock(&cdevpriv_mtx); if (fp->f_cdevpriv == NULL) { LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list); fp->f_cdevpriv = p; mtx_unlock(&cdevpriv_mtx); error = 0; } else { mtx_unlock(&cdevpriv_mtx); free(p, M_CDEVPDATA); error = EBUSY; } return (error); } void devfs_destroy_cdevpriv(struct cdev_privdata *p) { mtx_assert(&cdevpriv_mtx, MA_OWNED); KASSERT(p->cdpd_fp->f_cdevpriv == p, ("devfs_destoy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p)); p->cdpd_fp->f_cdevpriv = NULL; LIST_REMOVE(p, cdpd_list); mtx_unlock(&cdevpriv_mtx); (p->cdpd_dtr)(p->cdpd_data); free(p, M_CDEVPDATA); } static void devfs_fpdrop(struct file *fp) { struct cdev_privdata *p; mtx_lock(&cdevpriv_mtx); if ((p = fp->f_cdevpriv) == NULL) { mtx_unlock(&cdevpriv_mtx); return; } devfs_destroy_cdevpriv(p); } void devfs_clear_cdevpriv(void) { struct file *fp; fp = curthread->td_fpop; if (fp == NULL) return; devfs_fpdrop(fp); } static void devfs_usecount_add(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); if (VN_IS_DOOMED(vp)) { goto out_unlock; } de = vp->v_data; dev = vp->v_rdev; MPASS(de != NULL); MPASS(dev != NULL); dev->si_usecount++; de->de_usecount++; out_unlock: VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static void devfs_usecount_subl(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); de = vp->v_data; dev = vp->v_rdev; if (de == NULL) return; if (dev == NULL) { MPASS(de->de_usecount == 0); return; } if (dev->si_usecount < de->de_usecount) panic("%s: si_usecount underflow for dev %p " "(has %ld, dirent has %d)\n", __func__, dev, dev->si_usecount, de->de_usecount); if (VN_IS_DOOMED(vp)) { dev->si_usecount -= de->de_usecount; de->de_usecount = 0; } else { if (de->de_usecount == 0) panic("%s: de_usecount underflow for dev %p\n", __func__, dev); dev->si_usecount--; de->de_usecount--; } } static void devfs_usecount_sub(struct vnode *vp) { mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static int devfs_usecountl(struct vnode *vp) { VNPASS(vp->v_type == VCHR, vp); mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); return (vp->v_rdev->si_usecount); } int devfs_usecount(struct vnode *vp) { int count; VNPASS(vp->v_type == VCHR, vp); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); count = devfs_usecountl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); return (count); } void devfs_ctty_ref(struct vnode *vp) { vrefact(vp); devfs_usecount_add(vp); } void devfs_ctty_unref(struct vnode *vp) { devfs_usecount_sub(vp); vrele(vp); } /* * On success devfs_populate_vp() returns with dmp->dm_lock held. */ static int devfs_populate_vp(struct vnode *vp) { struct devfs_dirent *de; struct devfs_mount *dmp; int locked; ASSERT_VOP_LOCKED(vp, "devfs_populate_vp"); dmp = VFSTODEVFS(vp->v_mount); if (!devfs_populate_needed(dmp)) { sx_xlock(&dmp->dm_lock); goto out_nopopulate; } locked = VOP_ISLOCKED(vp); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); /* Can't call devfs_populate() with the vnode lock held. */ VOP_UNLOCK(vp); devfs_populate(dmp); sx_xunlock(&dmp->dm_lock); vn_lock(vp, locked | LK_RETRY); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ERESTART); } out_nopopulate: if (VN_IS_DOOMED(vp)) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } de = vp->v_data; KASSERT(de != NULL, ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed")); if ((de->de_flags & DE_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } return (0); } static int devfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct devfs_mount *dmp; char *buf = ap->a_buf; size_t *buflen = ap->a_buflen; struct devfs_dirent *dd, *de; int i, error; dmp = VFSTODEVFS(vp->v_mount); error = devfs_populate_vp(vp); if (error != 0) return (error); if (vp->v_type != VCHR && vp->v_type != VDIR) { error = ENOENT; goto finished; } dd = vp->v_data; if (vp->v_type == VDIR && dd == dmp->dm_rootdir) { *dvp = vp; vref(*dvp); goto finished; } i = *buflen; i -= dd->de_dirent->d_namlen; if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); *buflen = i; de = devfs_parent_dirent(dd); if (de == NULL) { error = ENOENT; goto finished; } mtx_lock(&devfs_de_interlock); *dvp = de->de_vnode; if (*dvp != NULL) { VI_LOCK(*dvp); mtx_unlock(&devfs_de_interlock); vholdl(*dvp); VI_UNLOCK(*dvp); vref(*dvp); vdrop(*dvp); } else { mtx_unlock(&devfs_de_interlock); error = ENOENT; } finished: sx_xunlock(&dmp->dm_lock); return (error); } /* * Construct the fully qualified path name relative to the mountpoint. * If a NULL cnp is provided, no '/' is appended to the resulting path. */ char * devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd, struct componentname *cnp) { int i; struct devfs_dirent *de; sx_assert(&dmp->dm_lock, SA_LOCKED); i = SPECNAMELEN; buf[i] = '\0'; if (cnp != NULL) i -= cnp->cn_namelen; if (i < 0) return (NULL); if (cnp != NULL) bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { if (cnp != NULL || i < SPECNAMELEN) { i--; if (i < 0) return (NULL); buf[i] = '/'; } i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = devfs_parent_dirent(de); if (de == NULL) return (NULL); } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode, struct vnode **vpp) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; struct cdevsw *dsw; enum vgetstate vs; dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { vs = vget_prep(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); vget_finish(vp, lockmode | LK_RETRY, vs); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } else if (VN_IS_DOOMED(vp)) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vput(vp); goto loop; } sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; VNPASS(vp->v_usecount == 1, vp); /* Special casing of ttys for deadfs. Probably redundant. */ dsw = dev->si_devsw; if (dsw != NULL && (dsw->d_flags & D_TTY) != 0) vp->v_vflag |= VV_ISTTY; dev_unlock(); VI_UNLOCK(vp); if ((dev->si_flags & SI_ETERNAL) != 0) vp->v_vflag |= VV_ETERNALDEV; vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS); VN_LOCK_ASHARE(vp); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp); if (error != 0) { mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct proc *p; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_accmode, ap->a_cred); if (error == 0) return (0); if (error != EACCES) return (error); p = ap->a_td->td_proc; /* We do, however, allow access to the controlling terminal */ PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == de->de_cdp) error = 0; PROC_UNLOCK(p); return (error); } _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0, "devfs-only flag reuse failed"); static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct proc *p; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; struct devfs_dirent *de = vp->v_data; int dflags, error, ref, vp_locked; /* * XXX: Don't call d_close() if we were called because of * XXX: insmntque() failure. */ if (vp->v_data == NULL) return (0); /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (de->de_usecount == 2 && td != NULL) { p = td->td_proc; PROC_LOCK(p); if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 2 && !VN_IS_DOOMED(vp)) { p->p_session->s_ttyvp = NULL; p->p_session->s_ttydp = NULL; oldvp = vp; } VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) devfs_ctty_unref(oldvp); } else PROC_UNLOCK(p); } /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); dflags = 0; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 1) dflags |= FLASTCLOSE; devfs_usecount_subl(vp); mtx_unlock(&devfs_de_interlock); if (VN_IS_DOOMED(vp)) { /* Forced close. */ dflags |= FREVOKE | FNONBLOCK; } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if ((dflags & FLASTCLOSE) == 0) { VI_UNLOCK(vp); dev_relthread(dev, ref); return (0); } vholdnz(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td); dev_relthread(dev, ref); vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { int error; struct file *fpop; /* * NB: td may be NULL if this descriptor is closed due to * garbage collection from a closed UNIX domain socket. */ fpop = curthread->td_fpop; curthread->td_fpop = fp; error = vnops.fo_close(fp, td); curthread->td_fpop = fpop; /* * The f_cdevpriv cannot be assigned non-NULL value while we * are destroying the file. */ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); return (error); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct devfs_dirent *de; struct devfs_mount *dmp; struct cdev *dev; struct timeval boottime; int error; error = devfs_populate_vp(vp); if (error != 0) return (error); dmp = VFSTODEVFS(vp->v_mount); sx_xunlock(&dmp->dm_lock); de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; getboottime(&boottime); #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = cdev2priv(dev)->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_filerev = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct file *fpop; int error; fpop = td->td_fpop; td->td_fpop = fp; error = vnops.fo_ioctl(fp, com, data, cred, td); td->td_fpop = fpop; return (error); } void * fiodgname_buf_get_ptr(void *fgnp, u_long com) { union { struct fiodgname_arg fgn; #ifdef COMPAT_FREEBSD32 struct fiodgname_arg32 fgn32; #endif } *fgnup; fgnup = fgnp; switch (com) { case FIODGNAME: return (fgnup->fgn.buf); #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: return ((void *)(uintptr_t)fgnup->fgn32.buf); #endif default: panic("Unhandled ioctl command %ld", com); } } static int devfs_ioctl(struct vop_ioctl_args *ap) { struct fiodgname_arg *fgn; struct vnode *vpold, *vp; struct cdevsw *dsw; struct thread *td; struct session *sess; struct cdev *dev; int error, ref, i; const char *p; u_long com; vp = ap->a_vp; com = ap->a_command; td = ap->a_td; dsw = devvn_refthread(vp, &dev, &ref); if (dsw == NULL) return (ENXIO); KASSERT(dev->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(dev))); switch (com) { case FIODTYPE: *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); break; default: error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); } dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { /* * Do nothing if reassigning same control tty, or if the * control tty has already disappeared. If it disappeared, * it's because we were racing with TIOCNOTTY. TIOCNOTTY * already took care of releasing the old vnode and we have * nothing left to do. */ sx_slock(&proctree_lock); sess = td->td_proc->p_session; if (sess->s_ttyvp == vp || sess->s_ttyp == NULL) { sx_sunlock(&proctree_lock); return (0); } devfs_ctty_ref(vp); SESS_LOCK(sess); vpold = sess->s_ttyvp; sess->s_ttyvp = vp; sess->s_ttydp = cdev2priv(dev); SESS_UNLOCK(sess); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) devfs_ctty_unref(vpold); } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; struct thread *td; td = curthread; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error) return (error); error = dsw->d_kqfilter(dev, kn); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } static inline int devfs_prison_check(struct devfs_dirent *de, struct thread *td) { struct cdev_priv *cdp; struct ucred *dcr; struct proc *p; int error; cdp = de->de_cdp; if (cdp == NULL) return (0); dcr = cdp->cdp_c.si_cred; if (dcr == NULL) return (0); error = prison_check(td->td_ucred, dcr); if (error == 0) return (0); /* We do, however, allow access to the controlling terminal */ p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == cdp) error = 0; PROC_UNLOCK(p); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct mount *mp; struct cdev *cdev; int error, flags, nameiop, dvplocked; char specname[SPECNAMELEN + 1], *pname; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; mp = dvp->v_mount; dmp = VFSTODEVFS(mp); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); de = devfs_parent_dirent(dd); if (de == NULL) return (ENOENT); dvplocked = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; vn_lock(dvp, dvplocked | LK_RETRY); return (error); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dmp, dd, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); if (cdev == NULL) sx_xlock(&dmp->dm_lock); else if (devfs_populate_vp(dvp) != 0) { *dm_unlock = 0; sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } else sx_xunlock(&dmp->dm_lock); dev_rel(cdev); return (ENOENT); } if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (cdev != NULL) dev_rel(cdev); return (ENOENT); } if (cdev == NULL) break; dev_lock(); dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { - cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } if (devfs_prison_check(de, td)) return (ENOENT); if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOTDIR); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. */ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error, ref, vlocked; struct cdevsw *dsw; struct file *fpop; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); if (fp == NULL && dsw->d_fdopen != NULL) { dev_relthread(dev, ref); return (ENXIO); } if (vp->v_type == VCHR) devfs_usecount_add(vp); vlocked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); fpop = td->td_fpop; td->td_fpop = fp; if (fp != NULL) { fp->f_data = dev; fp->f_vnode = vp; } if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); /* Clean up any cdevpriv upon error. */ if (error != 0) devfs_clear_cdevpriv(); td->td_fpop = fpop; vn_lock(vp, vlocked | LK_RETRY); if (error != 0 && vp->v_type == VCHR) devfs_usecount_sub(vp); dev_relthread(dev, ref); if (error != 0) { if (error == ERESTART) error = EINTR; return (error); } #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if (fp == NULL) return (error); #endif if (fp->f_ops == &badfileops) finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = INT_MAX; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_MAX_CANON: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_CANON; return (0); } return (EINVAL); case _PC_MAX_INPUT: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_INPUT; return (0); } return (EINVAL); case _PC_VDISABLE: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = _POSIX_VDISABLE; return (0); } return (EINVAL); case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistent label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_poll(fp, events, cred, td); return (error); } error = dsw->d_poll(dev, events, td); td->td_fpop = fpop; dev_relthread(dev, ref); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_read(fp, uio, cred, flags, td); return (error); } resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) devfs_timestamp(&dev->si_atime); td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_R); return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); if (devfs_populate_vp(ap->a_vp) != 0) { if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & (DE_COVERED | DE_WHITEOUT)) continue; if (devfs_prison_check(dd, uio->uio_td)) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp)); if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; /* NOTE: d_off is the offset for the *next* entry. */ dp->d_off = off + dp->d_reclen; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. */ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static void devfs_reclaiml(struct vnode *vp) { struct devfs_dirent *de; mtx_assert(&devfs_de_interlock, MA_OWNED); de = vp->v_data; if (de != NULL) { MPASS(de->de_usecount == 0); de->de_vnode = NULL; vp->v_data = NULL; } } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; vp = ap->a_vp; mtx_lock(&devfs_de_interlock); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); return (0); } static int devfs_reclaim_vchr(struct vop_reclaim_args *ap) { struct vnode *vp; struct cdev *dev; vp = ap->a_vp; MPASS(vp->v_type == VCHR); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; dev_unlock(); VI_UNLOCK(vp); if (dev != NULL) dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); ASSERT_VOP_ELOCKED(dvp, "devfs_remove"); ASSERT_VOP_ELOCKED(vp, "devfs_remove"); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_dirent->d_type == DT_LNK) { de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) de_covered->de_flags &= ~DE_COVERED; } /* We need to unlock dvp because devfs_delete() may lock it. */ VOP_UNLOCK(vp); if (dvp != vp) VOP_UNLOCK(dvp); devfs_delete(dmp, de, 0); sx_xunlock(&dmp->dm_lock); if (dvp != vp) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { de->de_flags |= DE_WHITEOUT; sx_xunlock(&dmp->dm_lock); } return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. * */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; enum vgetstate vs; u_int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = cdev2priv(dev); dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); vs = vget_prep(vp2); mtx_unlock(&devfs_de_interlock); if (vget_finish(vp2, LK_EXCLUSIVE, vs) != 0) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct devfs_mount *dmp; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); return (EBADF); } dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); VOP_UNLOCK(vp); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; td = curthread; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } error = devfs_populate_vp(vp); if (error != 0) return (error); de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(td, PRIV_VFS_CHOWN); if (error != 0) goto ret; } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto ret; } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { error = vn_utimes_perm(vp, vap, ap->a_cred, td); if (error != 0) goto ret; if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } ret: sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock); return (error); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred) { return (vnops.fo_stat(fp, sb, cred)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered, *de_dotdot; struct devfs_mount *dmp; error = priv_check(curthread, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOENT); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_flags = DE_USER; de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dir = dd; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) { if ((de_covered->de_flags & DE_USER) != 0) { devfs_delete(dmp, de, DEVFS_DEL_NORECURSE); sx_xunlock(&dmp->dm_lock); return (EEXIST); } KASSERT((de_covered->de_flags & DE_COVERED) == 0, ("devfs_symlink: entry %p already covered", de_covered)); de_covered->de_flags |= DE_COVERED; } de_dotdot = TAILQ_FIRST(&dd->de_dlist); /* "." */ de_dotdot = TAILQ_NEXT(de_dotdot, de_list); /* ".." */ TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list); devfs_dir_ref_de(dmp, dd); devfs_rules_apply(dmp, de); return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_write(fp, uio, cred, flags, td); return (error); } KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { devfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_W); return (error); } static int devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct mount *mp; struct vnode *vp; struct file *fpop; vm_object_t object; vm_prot_t maxprot; int error, ref; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * The one exception is that D_MMAP_ANON devices * (i.e. /dev/zero) permit private writable mappings. * * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests * as well as updating maxprot to permit writing for * D_MMAP_ANON devices rather than doing that here. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) return (error); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff, &object); td->td_fpop = fpop; dev_relthread(dev, ref); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (cdev2priv(x)->cdp_inode); } static struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = devfs_mmap_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; /* Vops for non-CHR vnodes in /dev. */ static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, .vop_vptocnp = devfs_vptocnp, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(devfs_vnodeops); /* Vops for VCHR vnodes in /dev. */ static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = vop_stdfsync, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_ioctl, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_poll = dead_poll, .vop_print = devfs_print, .vop_read = dead_read, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim_vchr, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_vptocnp = devfs_vptocnp, .vop_write = dead_write, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(devfs_specops); /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); diff --git a/sys/fs/ext2fs/ext2_lookup.c b/sys/fs/ext2fs/ext2_lookup.c index a8e28e24f9e2..16f2aa88b28c 100644 --- a/sys/fs/ext2fs/ext2_lookup.c +++ b/sys/fs/ext2fs/ext2_lookup.c @@ -1,1275 +1,1267 @@ /*- * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_lookup.c 8.6 (Berkeley) 4/1/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DECLARE(ext2fs); /* * ext2fs trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(ext2fs, , lookup, trace, "int", "char*"); SDT_PROBE_DEFINE4(ext2fs, , trace, ext2_dirbad_error, "char*", "ino_t", "doff_t", "char*"); SDT_PROBE_DEFINE5(ext2fs, , trace, ext2_dirbadentry_error, "char*", "int", "uint32_t", "uint16_t", "uint8_t"); static SYSCTL_NODE(_vfs, OID_AUTO, e2fs, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "EXT2FS filesystem"); /* DIRBLKSIZE in ffs is DEV_BSIZE (in most cases 512) while it is the native blocksize in ext2fs - thus, a #define is no longer appropriate */ #undef DIRBLKSIZ static u_char ext2_ft_to_dt[] = { DT_UNKNOWN, /* EXT2_FT_UNKNOWN */ DT_REG, /* EXT2_FT_REG_FILE */ DT_DIR, /* EXT2_FT_DIR */ DT_CHR, /* EXT2_FT_CHRDEV */ DT_BLK, /* EXT2_FT_BLKDEV */ DT_FIFO, /* EXT2_FT_FIFO */ DT_SOCK, /* EXT2_FT_SOCK */ DT_LNK, /* EXT2_FT_SYMLINK */ }; #define FTTODT(ft) \ ((ft) < nitems(ext2_ft_to_dt) ? ext2_ft_to_dt[(ft)] : DT_UNKNOWN) static u_char dt_to_ext2_ft[] = { EXT2_FT_UNKNOWN, /* DT_UNKNOWN */ EXT2_FT_FIFO, /* DT_FIFO */ EXT2_FT_CHRDEV, /* DT_CHR */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_DIR, /* DT_DIR */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_BLKDEV, /* DT_BLK */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_REG_FILE, /* DT_REG */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_SYMLINK, /* DT_LNK */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_SOCK, /* DT_SOCK */ EXT2_FT_UNKNOWN, /* unused */ EXT2_FT_UNKNOWN, /* DT_WHT */ }; #define DTTOFT(dt) \ ((dt) < nitems(dt_to_ext2_ft) ? dt_to_ext2_ft[(dt)] : EXT2_FT_UNKNOWN) static int ext2_check_direntry(struct vnode *dp, struct ext2fs_direct_2 *de, int entryoffsetinblock); static int ext2_is_dot_entry(struct componentname *cnp); static int ext2_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino); static int ext2_is_dot_entry(struct componentname *cnp) { if (cnp->cn_namelen <= 2 && cnp->cn_nameptr[0] == '.' && (cnp->cn_nameptr[1] == '.' || cnp->cn_nameptr[1] == '\0')) return (1); return (0); } /* * Vnode op for reading directories. */ int ext2_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct buf *bp; struct inode *ip; struct ext2fs_direct_2 *dp, *edp; uint64_t *cookies; struct dirent dstdp; off_t offset, startoffset; size_t readcnt, skipcnt; ssize_t startresid; u_int ncookies; int DIRBLKSIZ = VTOI(ap->a_vp)->i_e2fs->e2fs_bsize; int error; if (uio->uio_offset < 0) return (EINVAL); ip = VTOI(vp); if (ap->a_ncookies != NULL) { if (uio->uio_resid < 0) ncookies = 0; else ncookies = uio->uio_resid; if (uio->uio_offset >= ip->i_size) ncookies = 0; else if (ip->i_size - uio->uio_offset < ncookies) ncookies = ip->i_size - uio->uio_offset; ncookies = ncookies / (offsetof(struct ext2fs_direct_2, e2d_namlen) + 4) + 1; cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } else { ncookies = 0; cookies = NULL; } offset = startoffset = uio->uio_offset; startresid = uio->uio_resid; error = 0; while (error == 0 && uio->uio_resid > 0 && uio->uio_offset < ip->i_size) { error = ext2_blkatoff(vp, uio->uio_offset, NULL, &bp); if (error) break; if (bp->b_offset + bp->b_bcount > ip->i_size) readcnt = ip->i_size - bp->b_offset; else readcnt = bp->b_bcount; skipcnt = (size_t)(uio->uio_offset - bp->b_offset) & ~(size_t)(DIRBLKSIZ - 1); offset = bp->b_offset + skipcnt; dp = (struct ext2fs_direct_2 *)&bp->b_data[skipcnt]; edp = (struct ext2fs_direct_2 *)&bp->b_data[readcnt]; while (error == 0 && uio->uio_resid > 0 && dp < edp) { if (le16toh(dp->e2d_reclen) <= offsetof(struct ext2fs_direct_2, e2d_namlen) || (caddr_t)dp + le16toh(dp->e2d_reclen) > (caddr_t)edp) { error = EIO; break; } /*- * "New" ext2fs directory entries differ in 3 ways * from ufs on-disk ones: * - the name is not necessarily NUL-terminated. * - the file type field always exists and always * follows the name length field. * - the file type is encoded in a different way. * * "Old" ext2fs directory entries need no special * conversions, since they are binary compatible * with "new" entries having a file type of 0 (i.e., * EXT2_FT_UNKNOWN). Splitting the old name length * field didn't make a mess like it did in ufs, * because ext2fs uses a machine-independent disk * layout. */ dstdp.d_namlen = dp->e2d_namlen; dstdp.d_type = FTTODT(dp->e2d_type); if (offsetof(struct ext2fs_direct_2, e2d_namlen) + dstdp.d_namlen > le16toh(dp->e2d_reclen)) { error = EIO; break; } if (offset < startoffset || le32toh(dp->e2d_ino) == 0) goto nextentry; dstdp.d_fileno = le32toh(dp->e2d_ino); dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp); bcopy(dp->e2d_name, dstdp.d_name, dstdp.d_namlen); /* NOTE: d_off is the offset of the *next* entry. */ dstdp.d_off = offset + le16toh(dp->e2d_reclen); dirent_terminate(&dstdp); if (dstdp.d_reclen > uio->uio_resid) { if (uio->uio_resid == startresid) error = EINVAL; else error = EJUSTRETURN; break; } /* Advance dp. */ error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio); if (error) break; if (cookies != NULL) { KASSERT(ncookies > 0, ("ext2_readdir: cookies buffer too small")); *cookies = offset + le16toh(dp->e2d_reclen); cookies++; ncookies--; } nextentry: offset += le16toh(dp->e2d_reclen); dp = (struct ext2fs_direct_2 *)((caddr_t)dp + le16toh(dp->e2d_reclen)); } bqrelse(bp); uio->uio_offset = offset; } /* We need to correct uio_offset. */ uio->uio_offset = offset; if (error == EJUSTRETURN) error = 0; if (ap->a_ncookies != NULL) { if (error == 0) { ap->a_ncookies -= ncookies; } else { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } } if (error == 0 && ap->a_eofflag) *ap->a_eofflag = ip->i_size <= uio->uio_offset; return (error); } /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. * If the file system is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending * on whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and vput * instead of two vputs. * * Overall outline of ext2_lookup: * * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, leaving info on available slots * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * inode and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache */ int ext2_lookup(struct vop_cachedlookup_args *ap) { return (ext2_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); } static int ext2_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino) { struct inode *dp; /* inode for directory being searched */ struct buf *bp; /* a buffer of directory entries */ struct ext2fs_direct_2 *ep; /* the current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ struct ext2fs_searchslot ss; doff_t i_diroff; /* cached i_diroff value */ doff_t i_offset; /* cached i_offset value */ int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ doff_t prevoff; /* prev entry dp->i_offset */ struct vnode *pdp; /* saved dp during symlink work */ struct vnode *tdp; /* returned by VFS_VGET */ doff_t enduseful; /* pointer past last used dir slot */ u_long bmask; /* block offset mask */ int error; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; ino_t ino, ino1; int ltype; int entry_found = 0; int DIRBLKSIZ = VTOI(vdp)->i_e2fs->e2fs_bsize; if (vpp != NULL) *vpp = NULL; dp = VTOI(vdp); bmask = VFSTOEXT2(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; restart: bp = NULL; ss.slotoffset = -1; /* * We now have a segment name to search for, and a directory to search. * * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. */ i_diroff = dp->i_diroff; ss.slotstatus = FOUND; ss.slotfreespace = ss.slotsize = ss.slotneeded = 0; if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) { ss.slotstatus = NONE; ss.slotneeded = EXT2_DIR_REC_LEN(cnp->cn_namelen); /* * was ss.slotneeded = (sizeof(struct direct) - MAXNAMLEN + * cnp->cn_namelen + 3) &~ 3; */ } /* * Try to lookup dir entry using htree directory index. * * If we got an error or we want to find '.' or '..' entry, * we will fall back to linear search. */ if (!ext2_is_dot_entry(cnp) && ext2_htree_has_idx(dp)) { numdirpasses = 1; entryoffsetinblock = 0; switch (ext2_htree_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, &bp, &entryoffsetinblock, &i_offset, &prevoff, &enduseful, &ss)) { case 0: ep = (struct ext2fs_direct_2 *)((char *)bp->b_data + (i_offset & bmask)); goto foundentry; case ENOENT: i_offset = roundup2(dp->i_size, DIRBLKSIZ); goto notfound; default: /* * Something failed; just fallback to do a linear * search. */ break; } } /* * If there is cached information on a previous search of * this directory, pick up where we last left off. * We cache only lookups as these are the most common * and have the greatest payoff. Caching CREATE has little * benefit as it usually must search the entire directory * to determine that the entry does not exist. Caching the * location of the last DELETE or RENAME has not reduced * profiling time and hence has been removed in the interest * of simplicity. */ if (nameiop != LOOKUP || i_diroff == 0 || i_diroff > dp->i_size) { entryoffsetinblock = 0; i_offset = 0; numdirpasses = 1; } else { i_offset = i_diroff; if ((entryoffsetinblock = i_offset & bmask) && (error = ext2_blkatoff(vdp, (off_t)i_offset, NULL, &bp))) return (error); numdirpasses = 2; nchstats.ncs_2passes++; } prevoff = i_offset; endsearch = roundup2(dp->i_size, DIRBLKSIZ); enduseful = 0; searchloop: while (i_offset < endsearch) { /* * If necessary, get the next directory block. */ if (bp != NULL) brelse(bp); error = ext2_blkatoff(vdp, (off_t)i_offset, NULL, &bp); if (error != 0) return (error); entryoffsetinblock = 0; if (ss.slotstatus == NONE) { ss.slotoffset = -1; ss.slotfreespace = 0; } error = ext2_search_dirblock(dp, bp->b_data, &entry_found, cnp->cn_nameptr, cnp->cn_namelen, &entryoffsetinblock, &i_offset, &prevoff, &enduseful, &ss); if (error != 0) { brelse(bp); return (error); } if (entry_found) { ep = (struct ext2fs_direct_2 *)((char *)bp->b_data + (entryoffsetinblock & bmask)); foundentry: ino = le32toh(ep->e2d_ino); goto found; } } notfound: /* * If we started in the middle of the directory and failed * to find our target, we must check the beginning as well. */ if (numdirpasses == 2) { numdirpasses--; i_offset = 0; endsearch = i_diroff; goto searchloop; } if (bp != NULL) brelse(bp); /* * If creating, and at end of pathname and current * directory has not been removed, then can consider * allowing file to be created. */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN) && dp->i_nlink != 0) { /* * Access for write is interpreted as allowing * creation of files in the directory. */ if ((error = VOP_ACCESS(vdp, VWRITE, cred, curthread)) != 0) return (error); /* * Return an indication of where the new directory * entry should be put. If we didn't find a slot, * then set dp->i_count to 0 indicating * that the new slot belongs at the end of the * directory. If we found a slot, then the new entry * can be put in the range from dp->i_offset to * dp->i_offset + dp->i_count. */ if (ss.slotstatus == NONE) { dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ); dp->i_count = 0; enduseful = dp->i_offset; } else { dp->i_offset = ss.slotoffset; dp->i_count = ss.slotsize; if (enduseful < ss.slotoffset + ss.slotsize) enduseful = ss.slotoffset + ss.slotsize; } dp->i_endoff = roundup2(enduseful, DIRBLKSIZ); /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. - * The pathname buffer is saved so that the name - * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ - cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } /* * Insert name into cache (as non-existent) if appropriate. */ if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(vdp, NULL, cnp); return (ENOENT); found: if (dd_ino != NULL) *dd_ino = ino; if (numdirpasses == 2) nchstats.ncs_pass2++; /* * Check that directory length properly reflects presence * of this entry. */ if (entryoffsetinblock + EXT2_DIR_REC_LEN(ep->e2d_namlen) > dp->i_size) { ext2_dirbad(dp, i_offset, "i_size too small"); dp->i_size = entryoffsetinblock + EXT2_DIR_REC_LEN(ep->e2d_namlen); dp->i_flag |= IN_CHANGE | IN_UPDATE; } brelse(bp); /* * Found component in pathname. * If the final component of path name, save information * in the cache as to where the entry was found. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = rounddown2(i_offset, DIRBLKSIZ); /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. */ if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); /* * Write access to directory required to delete files. */ if ((error = VOP_ACCESS(vdp, VWRITE, cred, curthread)) != 0) return (error); /* * Return pointer to current entry in dp->i_offset, * and distance past previous entry (if there * is a previous entry in this block) in dp->i_count. * Save directory inode pointer in ndp->ni_dvp for dirremove(). * * Technically we shouldn't be setting these in the * WANTPARENT case (first lookup in rename()), but any * lookups that will result in directory changes will * overwrite these. */ dp->i_offset = i_offset; if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; if (dd_ino != NULL) return (0); if (dp->i_number == ino) { VREF(vdp); *vpp = vdp; return (0); } if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); /* * If directory is "sticky", then user must own * the directory, or the file in it, else she * may not delete it (unless she's root). This * implements append-only directories. */ if ((dp->i_mode & ISVTX) && cred->cr_uid != 0 && cred->cr_uid != dp->i_uid && VTOI(tdp)->i_uid != cred->cr_uid) { vput(tdp); return (EPERM); } *vpp = tdp; return (0); } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && (flags & ISLASTCN)) { if ((error = VOP_ACCESS(vdp, VWRITE, cred, curthread)) != 0) return (error); /* * Careful about locking second inode. * This can only occur if the target is ".". */ dp->i_offset = i_offset; if (dp->i_number == ino) return (EISDIR); if (dd_ino != NULL) return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); *vpp = tdp; - cnp->cn_flags |= SAVENAME; return (0); } if (dd_ino != NULL) return (0); /* * Step through the translation in the name. We do not `vput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the VFS_VGET for the * inode associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the file system has any hard links other than ".." * that point backwards in the directory structure. */ pdp = vdp; if (flags & ISDOTDOT) { error = vn_vget_ino(pdp, ino, cnp->cn_lkflags, &tdp); if (VN_IS_DOOMED(pdp)) { if (error == 0) vput(tdp); error = ENOENT; } if (error) return (error); /* * Recheck that ".." entry in the vdp directory points * to the inode we looked up before vdp lock was * dropped. */ error = ext2_lookup_ino(pdp, NULL, cnp, &ino1); if (error) { vput(tdp); return (error); } if (ino1 != ino) { vput(tdp); goto restart; } *vpp = tdp; } else if (dp->i_number == ino) { VREF(vdp); /* we want ourself, ie "." */ /* * When we lookup "." we still can be asked to lock it * differently. */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(vdp)) { if (ltype == LK_EXCLUSIVE) vn_lock(vdp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(vdp, LK_DOWNGRADE | LK_RETRY); } *vpp = vdp; } else { if ((error = VFS_VGET(vdp->v_mount, ino, cnp->cn_lkflags, &tdp)) != 0) return (error); *vpp = tdp; } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } int ext2_search_dirblock(struct inode *ip, void *data, int *foundp, const char *name, int namelen, int *entryoffsetinblockp, doff_t *offp, doff_t *prevoffp, doff_t *endusefulp, struct ext2fs_searchslot *ssp) { struct vnode *vdp; struct ext2fs_direct_2 *ep, *top; uint32_t bsize = ip->i_e2fs->e2fs_bsize; int offset = *entryoffsetinblockp; int namlen; vdp = ITOV(ip); ep = (struct ext2fs_direct_2 *)((char *)data + offset); top = (struct ext2fs_direct_2 *)((char *)data + bsize); while (ep < top) { if (ext2_check_direntry(vdp, ep, offset)) { int i; ext2_dirbad(ip, *offp, "mangled entry"); i = bsize - (offset & (bsize - 1)); *offp += i; offset += i; ep = (struct ext2fs_direct_2 *)((char *)data + offset); continue; } /* * If an appropriate sized slot has not yet been found, * check to see if one is available. Also accumulate space * in the current block so that we can determine if * compaction is viable. */ if (ssp->slotstatus != FOUND) { int size = le16toh(ep->e2d_reclen); if (ep->e2d_ino != 0) size -= EXT2_DIR_REC_LEN(ep->e2d_namlen); else if (ext2_is_dirent_tail(ip, ep)) size -= sizeof(struct ext2fs_direct_tail); if (size > 0) { if (size >= ssp->slotneeded) { ssp->slotstatus = FOUND; ssp->slotoffset = *offp; ssp->slotsize = le16toh(ep->e2d_reclen); } else if (ssp->slotstatus == NONE) { ssp->slotfreespace += size; if (ssp->slotoffset == -1) ssp->slotoffset = *offp; if (ssp->slotfreespace >= ssp->slotneeded) { ssp->slotstatus = COMPACT; ssp->slotsize = *offp + le16toh(ep->e2d_reclen) - ssp->slotoffset; } } } } /* * Check for a name match. */ if (ep->e2d_ino != 0) { namlen = ep->e2d_namlen; if (namlen == namelen && !bcmp(name, ep->e2d_name, (unsigned)namlen)) { /* * Save directory entry's inode number and * reclen in ndp->ni_ufs area, and release * directory buffer. */ *foundp = 1; return (0); } } *prevoffp = *offp; *offp += le16toh(ep->e2d_reclen); offset += le16toh(ep->e2d_reclen); *entryoffsetinblockp = offset; if (ep->e2d_ino != 0) *endusefulp = *offp; /* * Get pointer to the next entry. */ ep = (struct ext2fs_direct_2 *)((char *)data + offset); } return (0); } void ext2_dirbad(struct inode *ip, doff_t offset, char *how) { struct mount *mp; mp = ITOV(ip)->v_mount; if ((mp->mnt_flag & MNT_RDONLY) == 0) panic("ext2_dirbad: %s: bad dir ino %ju at offset %ld: %s\n", mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, (long)offset, how); else SDT_PROBE4(ext2fs, , trace, ext2_dirbad_error, mp->mnt_stat.f_mntonname, ip->i_number, offset, how); } /* * Do consistency checking on a directory entry: * record length must be multiple of 4 * entry must fit in rest of its DIRBLKSIZ block * record must be large enough to contain entry * name is not longer than MAXNAMLEN * name must be as long as advertised, and null terminated */ static int ext2_check_direntry(struct vnode *dp, struct ext2fs_direct_2 *de, int entryoffsetinblock) { struct m_ext2fs *fs = VTOI(dp)->i_e2fs; char *error_msg = NULL; if (le16toh(de->e2d_reclen) < EXT2_DIR_REC_LEN(1)) error_msg = "rec_len is smaller than minimal"; else if (le16toh(de->e2d_reclen) % 4 != 0) error_msg = "rec_len % 4 != 0"; else if (le16toh(de->e2d_reclen) < EXT2_DIR_REC_LEN(de->e2d_namlen)) error_msg = "reclen is too small for name_len"; else if (entryoffsetinblock + le16toh(de->e2d_reclen)> fs->e2fs_bsize) error_msg = "directory entry across blocks"; else if (le32toh(de->e2d_ino) > fs->e2fs->e2fs_icount) error_msg = "directory entry inode out of bounds"; if (error_msg != NULL) { SDT_PROBE5(ext2fs, , trace, ext2_dirbadentry_error, error_msg, entryoffsetinblock, le32toh(de->e2d_ino), le16toh(de->e2d_reclen), de->e2d_namlen); } return (error_msg == NULL ? 0 : EINVAL); } /* * Insert an entry into the fresh directory block. * Initialize entry tail if the metadata_csum feature is turned on. */ static int ext2_add_first_entry(struct vnode *dvp, struct ext2fs_direct_2 *entry, struct componentname *cnp) { struct inode *dp; struct iovec aiov; struct uio auio; char* buf = NULL; int dirblksize, error; dp = VTOI(dvp); dirblksize = dp->i_e2fs->e2fs_bsize; if (dp->i_offset & (dirblksize - 1)) panic("ext2_add_first_entry: bad directory offset"); if (EXT2_HAS_RO_COMPAT_FEATURE(dp->i_e2fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { entry->e2d_reclen = htole16(dirblksize - sizeof(struct ext2fs_direct_tail)); buf = malloc(dirblksize, M_TEMP, M_WAITOK); memcpy(buf, entry, EXT2_DIR_REC_LEN(entry->e2d_namlen)); ext2_init_dirent_tail(EXT2_DIRENT_TAIL(buf, dirblksize)); ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)buf); auio.uio_offset = dp->i_offset; auio.uio_resid = dirblksize; aiov.iov_len = auio.uio_resid; aiov.iov_base = (caddr_t)buf; } else { entry->e2d_reclen = htole16(dirblksize); auio.uio_offset = dp->i_offset; auio.uio_resid = EXT2_DIR_REC_LEN(entry->e2d_namlen); aiov.iov_len = auio.uio_resid; aiov.iov_base = (caddr_t)entry; } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = (struct thread *)0; error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred); if (error) goto out; dp->i_size = roundup2(dp->i_size, dirblksize); dp->i_flag |= IN_CHANGE; out: free(buf, M_TEMP); return (error); } /* * Write a directory entry after a call to namei, using the parameters * that it left in nameidata. The argument ip is the inode which the new * directory entry will refer to. Dvp is a pointer to the directory to * be written, which was left locked by namei. Remaining parameters * (dp->i_offset, dp->i_count) indicate how the space for the new * entry is to be obtained. */ int ext2_direnter(struct inode *ip, struct vnode *dvp, struct componentname *cnp) { struct inode *dp; struct ext2fs_direct_2 newdir; int DIRBLKSIZ = ip->i_e2fs->e2fs_bsize; int error; -#ifdef INVARIANTS - if ((cnp->cn_flags & SAVENAME) == 0) - panic("ext2_direnter: missing name"); -#endif dp = VTOI(dvp); newdir.e2d_ino = htole32(ip->i_number); if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs, EXT2F_INCOMPAT_FTYPE)) { newdir.e2d_namlen = cnp->cn_namelen; newdir.e2d_type = DTTOFT(IFTODT(ip->i_mode)); } else newdir.e2d_namlen = htole16(cnp->cn_namelen); bcopy(cnp->cn_nameptr, newdir.e2d_name, (unsigned)cnp->cn_namelen + 1); if (ext2_htree_has_idx(dp)) { error = ext2_htree_add_entry(dvp, &newdir, cnp); if (error) { dp->i_flag &= ~IN_E3INDEX; dp->i_flag |= IN_CHANGE | IN_UPDATE; } return (error); } if (EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_DIRHASHINDEX) && !ext2_htree_has_idx(dp)) { if ((dp->i_size / DIRBLKSIZ) == 1 && dp->i_offset == DIRBLKSIZ) { /* * Making indexed directory when one block is not * enough to save all entries. */ return ext2_htree_create_index(dvp, cnp, &newdir); } } /* * If dp->i_count is 0, then namei could find no * space in the directory. Here, dp->i_offset will * be on a directory block boundary and we will write the * new entry into a fresh block. */ if (dp->i_count == 0) return ext2_add_first_entry(dvp, &newdir, cnp); error = ext2_add_entry(dvp, &newdir); if (!error && dp->i_endoff && dp->i_endoff < dp->i_size) error = ext2_truncate(dvp, (off_t)dp->i_endoff, IO_SYNC, cnp->cn_cred, curthread); return (error); } /* * Insert an entry into the directory block. * Compact the contents. */ int ext2_add_entry(struct vnode *dvp, struct ext2fs_direct_2 *entry) { struct ext2fs_direct_2 *ep, *nep; struct inode *dp; struct buf *bp; u_int dsize; int error, loc, newentrysize, spacefree; char *dirbuf; dp = VTOI(dvp); /* * If dp->i_count is non-zero, then namei found space * for the new entry in the range dp->i_offset to * dp->i_offset + dp->i_count in the directory. * To use this space, we may have to compact the entries located * there, by copying them together towards the beginning of the * block, leaving the free space in one usable chunk at the end. */ /* * Increase size of directory if entry eats into new space. * This should never push the size past a new multiple of * DIRBLKSIZE. * * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. */ if (dp->i_offset + dp->i_count > dp->i_size) dp->i_size = dp->i_offset + dp->i_count; /* * Get the block containing the space for the new directory entry. */ if ((error = ext2_blkatoff(dvp, (off_t)dp->i_offset, &dirbuf, &bp)) != 0) return (error); /* * Find space for the new entry. In the simple case, the entry at * offset base will have the space. If it does not, then namei * arranged that compacting the region dp->i_offset to * dp->i_offset + dp->i_count would yield the * space. */ newentrysize = EXT2_DIR_REC_LEN(entry->e2d_namlen); ep = (struct ext2fs_direct_2 *)dirbuf; dsize = EXT2_DIR_REC_LEN(ep->e2d_namlen); spacefree = le16toh(ep->e2d_reclen) - dsize; for (loc = le16toh(ep->e2d_reclen); loc < dp->i_count; ) { nep = (struct ext2fs_direct_2 *)(dirbuf + loc); if (le32toh(ep->e2d_ino)) { /* trim the existing slot */ ep->e2d_reclen = htole16(dsize); ep = (struct ext2fs_direct_2 *)((char *)ep + dsize); } else { /* overwrite; nothing there; header is ours */ spacefree += dsize; } dsize = EXT2_DIR_REC_LEN(nep->e2d_namlen); spacefree += le16toh(nep->e2d_reclen) - dsize; loc += le16toh(nep->e2d_reclen); bcopy((caddr_t)nep, (caddr_t)ep, dsize); } /* * Update the pointer fields in the previous entry (if any), * copy in the new entry, and write out the block. */ if (ep->e2d_ino == 0) { if (spacefree + dsize < newentrysize) panic("ext2_direnter: compact1"); entry->e2d_reclen = htole16(spacefree + dsize); } else { if (spacefree < newentrysize) panic("ext2_direnter: compact2"); entry->e2d_reclen = htole16(spacefree); ep->e2d_reclen = htole16(dsize); ep = (struct ext2fs_direct_2 *)((char *)ep + dsize); } bcopy((caddr_t)entry, (caddr_t)ep, (u_int)newentrysize); ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)bp->b_data); if (DOINGASYNC(dvp)) { bdwrite(bp); error = 0; } else { error = bwrite(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Remove a directory entry after a call to namei, using * the parameters which it left in nameidata. The entry * dp->i_offset contains the offset into the directory of the * entry to be eliminated. The dp->i_count field contains the * size of the previous record in the directory. If this * is 0, the first entry is being deleted, so we need only * zero the inode number to mark the entry as free. If the * entry is not the first in the directory, we must reclaim * the space of the now empty record by adding the record size * to the size of the previous entry. */ int ext2_dirremove(struct vnode *dvp, struct componentname *cnp) { struct inode *dp; struct ext2fs_direct_2 *ep, *rep; struct buf *bp; int error; dp = VTOI(dvp); if (dp->i_count == 0) { /* * First entry in block: set d_ino to zero. */ if ((error = ext2_blkatoff(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0) return (error); ep->e2d_ino = 0; ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)bp->b_data); error = bwrite(bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Collapse new free space into previous entry. */ if ((error = ext2_blkatoff(dvp, (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0) return (error); /* Set 'rep' to the entry being removed. */ if (dp->i_count == 0) rep = ep; else rep = (struct ext2fs_direct_2 *)((char *)ep + le16toh(ep->e2d_reclen)); ep->e2d_reclen += rep->e2d_reclen; ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)bp->b_data); if (DOINGASYNC(dvp) && dp->i_count != 0) bdwrite(bp); else error = bwrite(bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Rewrite an existing directory entry to point at the inode * supplied. The parameters describing the directory entry are * set up by a call to namei. */ int ext2_dirrewrite(struct inode *dp, struct inode *ip, struct componentname *cnp) { struct buf *bp; struct ext2fs_direct_2 *ep; struct vnode *vdp = ITOV(dp); int error; if ((error = ext2_blkatoff(vdp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0) return (error); ep->e2d_ino = htole32(ip->i_number); if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs, EXT2F_INCOMPAT_FTYPE)) ep->e2d_type = DTTOFT(IFTODT(ip->i_mode)); else ep->e2d_type = EXT2_FT_UNKNOWN; ext2_dirent_csum_set(dp, (struct ext2fs_direct_2 *)bp->b_data); error = bwrite(bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Check if a directory is empty or not. * Inode supplied must be locked. * * Using a struct dirtemplate here is not precisely * what we want, but better than using a struct direct. * * NB: does not handle corrupted directories. */ int ext2_dirempty(struct inode *ip, ino_t parentino, struct ucred *cred) { off_t off; struct dirtemplate dbuf; struct ext2fs_direct_2 *dp = (struct ext2fs_direct_2 *)&dbuf; int error, namlen; ssize_t count; #define MINDIRSIZ (sizeof(struct dirtemplate) / 2) for (off = 0; off < ip->i_size; off += le16toh(dp->e2d_reclen)) { error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, &count, (struct thread *)0); /* * Since we read MINDIRSIZ, residual must * be 0 unless we're at end of file. */ if (error || count != 0) return (0); /* avoid infinite loops */ if (dp->e2d_reclen == 0) return (0); /* skip empty entries */ if (dp->e2d_ino == 0) continue; /* accept only "." and ".." */ namlen = dp->e2d_namlen; if (namlen > 2) return (0); if (dp->e2d_name[0] != '.') return (0); /* * At this point namlen must be 1 or 2. * 1 implies ".", 2 implies ".." if second * char is also "." */ if (namlen == 1) continue; if (dp->e2d_name[1] == '.' && le32toh(dp->e2d_ino) == parentino) continue; return (0); } return (1); } /* * Check if source directory is in the path of the target directory. * Target is supplied locked, source is unlocked. * The target is always vput before returning. */ int ext2_checkpath(struct inode *source, struct inode *target, struct ucred *cred) { struct vnode *vp; int error, namlen; struct dirtemplate dirbuf; vp = ITOV(target); if (target->i_number == source->i_number) { error = EEXIST; goto out; } if (target->i_number == EXT2_ROOTINO) { error = 0; goto out; } for (;;) { if (vp->v_type != VDIR) { error = ENOTDIR; break; } error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, sizeof(struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, NULL, NULL); if (error != 0) break; namlen = dirbuf.dotdot_type; /* like ufs little-endian */ if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') { error = ENOTDIR; break; } if (le32toh(dirbuf.dotdot_ino) == source->i_number) { error = EINVAL; break; } if (le32toh(dirbuf.dotdot_ino) == EXT2_ROOTINO) break; vput(vp); if ((error = VFS_VGET(vp->v_mount, le32toh(dirbuf.dotdot_ino), LK_EXCLUSIVE, &vp)) != 0) { vp = NULL; break; } } out: if (error == ENOTDIR) SDT_PROBE2(ext2fs, , lookup, trace, 1, "checkpath: .. not a directory"); if (vp != NULL) vput(vp); return (error); } diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c index 9843fc16e6b2..b629ddefcd1a 100644 --- a/sys/fs/ext2fs/ext2_vnops.c +++ b/sys/fs/ext2fs/ext2_vnops.c @@ -1,2350 +1,2333 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.7 (Berkeley) 2/3/94 * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 * $FreeBSD$ */ #include "opt_suiddir.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DECLARE(ext2fs); /* * ext2fs trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(ext2fs, , vnops, trace, "int", "char*"); static int ext2_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *); static void ext2_itimes_locked(struct vnode *); static vop_access_t ext2_access; static int ext2_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ext2_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ext2_close; static vop_create_t ext2_create; static vop_fsync_t ext2_fsync; static vop_getattr_t ext2_getattr; static vop_ioctl_t ext2_ioctl; static vop_link_t ext2_link; static vop_mkdir_t ext2_mkdir; static vop_mknod_t ext2_mknod; static vop_open_t ext2_open; static vop_pathconf_t ext2_pathconf; static vop_print_t ext2_print; static vop_read_t ext2_read; static vop_readlink_t ext2_readlink; static vop_remove_t ext2_remove; static vop_rename_t ext2_rename; static vop_rmdir_t ext2_rmdir; static vop_setattr_t ext2_setattr; static vop_strategy_t ext2_strategy; static vop_symlink_t ext2_symlink; static vop_write_t ext2_write; static vop_deleteextattr_t ext2_deleteextattr; static vop_getextattr_t ext2_getextattr; static vop_listextattr_t ext2_listextattr; static vop_setextattr_t ext2_setextattr; static vop_vptofh_t ext2_vptofh; static vop_close_t ext2fifo_close; /* Global vfs data structures for ext2. */ struct vop_vector ext2_vnodeops = { .vop_default = &default_vnodeops, .vop_access = ext2_access, .vop_bmap = ext2_bmap, .vop_cachedlookup = ext2_lookup, .vop_close = ext2_close, .vop_create = ext2_create, .vop_fsync = ext2_fsync, .vop_getpages = vnode_pager_local_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_ioctl = ext2_ioctl, .vop_link = ext2_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = ext2_mkdir, .vop_mknod = ext2_mknod, .vop_open = ext2_open, .vop_pathconf = ext2_pathconf, .vop_poll = vop_stdpoll, .vop_print = ext2_print, .vop_read = ext2_read, .vop_readdir = ext2_readdir, .vop_readlink = ext2_readlink, .vop_reallocblks = ext2_reallocblks, .vop_reclaim = ext2_reclaim, .vop_remove = ext2_remove, .vop_rename = ext2_rename, .vop_rmdir = ext2_rmdir, .vop_setattr = ext2_setattr, .vop_strategy = ext2_strategy, .vop_symlink = ext2_symlink, .vop_write = ext2_write, .vop_deleteextattr = ext2_deleteextattr, .vop_getextattr = ext2_getextattr, .vop_listextattr = ext2_listextattr, .vop_setextattr = ext2_setextattr, #ifdef UFS_ACL .vop_getacl = ext2_getacl, .vop_setacl = ext2_setacl, .vop_aclcheck = ext2_aclcheck, #endif /* UFS_ACL */ .vop_vptofh = ext2_vptofh, }; VFS_VOP_VECTOR_REGISTER(ext2_vnodeops); struct vop_vector ext2_fifoops = { .vop_default = &fifo_specops, .vop_access = ext2_access, .vop_close = ext2fifo_close, .vop_fsync = ext2_fsync, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_pathconf = ext2_pathconf, .vop_print = ext2_print, .vop_read = VOP_PANIC, .vop_reclaim = ext2_reclaim, .vop_setattr = ext2_setattr, .vop_write = VOP_PANIC, .vop_vptofh = ext2_vptofh, }; VFS_VOP_VECTOR_REGISTER(ext2_fifoops); /* * A virgin directory (no blushing please). * Note that the type and namlen fields are reversed relative to ext2. * Also, we don't use `struct odirtemplate', since it would just cause * endianness problems. */ static struct dirtemplate mastertemplate = { 0, htole16(12), 1, EXT2_FT_DIR, ".", 0, htole16(DIRBLKSIZ - 12), 2, EXT2_FT_DIR, ".." }; static struct dirtemplate omastertemplate = { 0, htole16(12), 1, EXT2_FT_UNKNOWN, ".", 0, htole16(DIRBLKSIZ - 12), 2, EXT2_FT_UNKNOWN, ".." }; static void ext2_itimes_locked(struct vnode *vp) { struct inode *ip; struct timespec ts; ASSERT_VI_LOCKED(vp, __func__); ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR)) ip->i_flag |= IN_LAZYMOD; else ip->i_flag |= IN_MODIFIED; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { ip->i_atime = ts.tv_sec; ip->i_atimensec = ts.tv_nsec; } if (ip->i_flag & IN_UPDATE) { ip->i_mtime = ts.tv_sec; ip->i_mtimensec = ts.tv_nsec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { ip->i_ctime = ts.tv_sec; ip->i_ctimensec = ts.tv_nsec; } } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } void ext2_itimes(struct vnode *vp) { VI_LOCK(vp); ext2_itimes_locked(vp); VI_UNLOCK(vp); } /* * Create a regular file */ static int ext2_create(struct vop_create_args *ap) { int error; error = ext2_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error != 0) return (error); if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp); return (0); } static int ext2_open(struct vop_open_args *ap) { if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); vnode_create_vobject(ap->a_vp, VTOI(ap->a_vp)->i_size, ap->a_td); return (0); } /* * Close called. * * Update the times on the inode. */ static int ext2_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes_locked(vp); VI_UNLOCK(vp); return (0); } static int ext2_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } /* If immutable bit set, nobody gets to write it. */ if ((accmode & VWRITE) && (ip->i_flags & SF_IMMUTABLE)) return (EPERM); error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_accmode, ap->a_cred); return (error); } static int ext2_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; ext2_itimes(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ip->i_devvp->v_rdev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = ip->i_rdev; vap->va_size = ip->i_size; vap->va_atime.tv_sec = ip->i_atime; vap->va_atime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_atimensec : 0; vap->va_mtime.tv_sec = ip->i_mtime; vap->va_mtime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_mtimensec : 0; vap->va_ctime.tv_sec = ip->i_ctime; vap->va_ctime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_ctimensec : 0; if (E2DI_HAS_XTIME(ip)) { vap->va_birthtime.tv_sec = ip->i_birthtime; vap->va_birthtime.tv_nsec = ip->i_birthnsec; } vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_bytes = dbtob((u_quad_t)ip->i_blocks); vap->va_type = IFTOVT(ip->i_mode); vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ext2_setattr(struct vop_setattr_args *ap) { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { /* Disallow flags not supported by ext2fs. */ if (vap->va_flags & ~(SF_APPEND | SF_IMMUTABLE | UF_NODUMP)) return (EOPNOTSUPP); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes and privileged processes in * jail() are not permitted to unset system flags, or * modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } } else { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND) || ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE)) return (EPERM); } ip->i_flags = vap->va_flags; ip->i_flag |= IN_CHANGE; if (ip->i_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ext2_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } if ((error = ext2_truncate(vp, vap->va_size, 0, cred, td)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * From utimes(2): * If times is NULL, ... The caller must be the owner of * the file, have permission to write the file, or be the * super-user. * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, td)))) return (error); ip->i_flag |= IN_CHANGE | IN_MODIFIED; if (vap->va_atime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_ACCESS; ip->i_atime = vap->va_atime.tv_sec; ip->i_atimensec = vap->va_atime.tv_nsec; } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_UPDATE; ip->i_mtime = vap->va_mtime.tv_sec; ip->i_mtimensec = vap->va_mtime.tv_nsec; } if (E2DI_HAS_XTIME(ip) && vap->va_birthtime.tv_sec != VNOVAL) { ip->i_birthtime = vap->va_birthtime.tv_sec; ip->i_birthnsec = vap->va_birthtime.tv_nsec; } error = ext2_update(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ext2_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ext2_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { error = priv_check_cred(cred, PRIV_VFS_STICKYFILE); if (error) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ext2_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file * to a group of which we are not a member, the caller must * have privilege. */ if (uid != ip->i_uid || (gid != ip->i_gid && !groupmember(gid, cred))) { error = priv_check_cred(cred, PRIV_VFS_CHOWN); if (error) return (error); } ogid = ip->i_gid; ouid = ip->i_uid; ip->i_gid = gid; ip->i_uid = uid; ip->i_flag |= IN_CHANGE; if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID) != 0) ip->i_mode &= ~(ISUID | ISGID); } return (0); } /* * Synch an open file. */ /* ARGSUSED */ static int ext2_fsync(struct vop_fsync_args *ap) { /* * Flush all dirty buffers associated with a vnode. */ vop_stdfsync(ap); return (ext2_update(ap->a_vp, ap->a_waitfor == MNT_WAIT)); } static int ext2_check_mknod_limits(dev_t dev) { unsigned maj = major(dev); unsigned min = minor(dev); if (maj > EXT2_MAJOR_MAX || min > EXT2_MINOR_MAX) return (EINVAL); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ static int ext2_mknod(struct vop_mknod_args *ap) { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; if (vap->va_rdev != VNOVAL) { error = ext2_check_mknod_limits(vap->va_rdev); if (error) return (error); } error = ext2_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) ip->i_rdev = vap->va_rdev; /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. XXX I don't believe this is necessary now. */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } static int ext2_remove(struct vop_remove_args *ap) { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } error = ext2_dirremove(dvp, ap->a_cnp); if (error == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } /* * link vnode call */ static int ext2_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; int error; -#ifdef INVARIANTS - if ((cnp->cn_flags & HASBUF) == 0) - panic("ext2_link: no name"); -#endif ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= EXT4_LINK_MAX) { error = EMLINK; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_nlink++; ip->i_flag |= IN_CHANGE; error = ext2_update(vp, !DOINGASYNC(vp)); if (!error) error = ext2_direnter(ip, tdvp, cnp); if (error) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } static int ext2_inc_nlink(struct inode *ip) { ip->i_nlink++; if (S_ISDIR(ip->i_mode) && EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK) && ip->i_nlink > 1) { if (ip->i_nlink >= EXT4_LINK_MAX || ip->i_nlink == 2) ip->i_nlink = 1; } else if (ip->i_nlink > EXT4_LINK_MAX) { ip->i_nlink--; return (EMLINK); } return (0); } static void ext2_dec_nlink(struct inode *ip) { if (!S_ISDIR(ip->i_mode) || ip->i_nlink > 2) ip->i_nlink--; } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int ext2_rename(struct vop_rename_args *ap) { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct inode *ip, *xp, *dp; struct dirtemplate *dirbuf; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0; u_char namlen; -#ifdef INVARIANTS - if ((tcnp->cn_flags & HASBUF) == 0 || - (fcnp->cn_flags & HASBUF) == 0) - panic("ext2_rename: no name"); -#endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. Temporarily just warn if they do. */ if (fvp == tvp) { SDT_PROBE2(ext2fs, , vnops, trace, 1, "rename: fvp == tvp (can't happen)"); error = 0; goto abortit; } if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if (ip->i_nlink >= EXT4_LINK_MAX && !EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) { VOP_UNLOCK(fvp); error = EMLINK; goto abortit; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory++; } vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ext2_inc_nlink(ip); ip->i_flag |= IN_CHANGE; if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) { VOP_UNLOCK(fvp); goto bad; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, curthread); VOP_UNLOCK(fvp); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); error = ext2_checkpath(ip, dp, tcnp->cn_cred); if (error) goto out; VREF(tdvp); error = vfs_relookup(tdvp, &tvp, tcnp); if (error) goto out; vrele(tdvp); dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { error = ext2_inc_nlink(dp); if (error) goto bad; dp->i_flag |= IN_CHANGE; error = ext2_update(tdvp, !DOINGASYNC(tdvp)); if (error) goto bad; } error = ext2_direnter(ip, tdvp, tcnp); if (error) { if (doingdirectory && newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; (void)ext2_update(tdvp, 1); } goto bad; } vput(tdvp); } else { if (xp->i_devvp != dp->i_devvp || xp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("ext2_rename: same file"); /* * If the parent directory is "sticky", then the user must * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. */ if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && tcnp->cn_cred->cr_uid != dp->i_uid && xp->i_uid != tcnp->cn_cred->cr_uid) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode & IFMT) == IFDIR) { if (!ext2_dirempty(xp, dp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = ext2_dirrewrite(dp, ip, tcnp); if (error) goto bad; /* * If the target directory is in the same * directory as the source directory, * decrement the link count on the parent * of the target directory. */ if (doingdirectory && !newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; } vput(tdvp); /* * Adjust the link count of the target to * reflect the dirrewrite above. If this is * a directory it is empty and there are * no links to it, so we can squash the inode and * any space associated with it. We disallowed * renaming over top of a directory with links to * it above, as the remaining link would point to * a directory without "." or ".." entries. */ ext2_dec_nlink(xp); if (doingdirectory) { if (xp->i_nlink > 2) panic("ext2_rename: linked directory"); error = ext2_truncate(tvp, (off_t)0, IO_SYNC, tcnp->cn_cred, curthread); xp->i_nlink = 0; } xp->i_flag |= IN_CHANGE; vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; VREF(fdvp); error = vfs_relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. IN_RENAME is not sufficient * to protect against directory races due to timing windows, * so we can't panic here. */ vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; its link * count of three would cause a rmdir to fail with ENOTEMPTY. * The IN_RENAME flag ensures that it cannot be moved by another * rename. */ if (xp != ip) { /* * From name resolves to a different inode. IN_RENAME is * not sufficient protection against timing window races * so we can't panic here. */ } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; dirbuf = malloc(dp->i_e2fs->e2fs_bsize, M_TEMP, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, fvp, (caddr_t)dirbuf, ip->i_e2fs->e2fs_bsize, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, NULL, NULL); if (error == 0) { /* Like ufs little-endian: */ namlen = dirbuf->dotdot_type; if (namlen != 2 || dirbuf->dotdot_name[0] != '.' || dirbuf->dotdot_name[1] != '.') { ext2_dirbad(xp, (doff_t)12, "rename: mangled dir"); } else { dirbuf->dotdot_ino = htole32(newparent); /* * dirblock 0 could be htree root, * try both csum update functions. */ ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)dirbuf); ext2_dx_csum_set(ip, (struct ext2fs_direct_2 *)dirbuf); (void)vn_rdwr(UIO_WRITE, fvp, (caddr_t)dirbuf, ip->i_e2fs->e2fs_bsize, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, NULL, NULL); cache_purge(fdvp); } } free(dirbuf, M_TEMP); } error = ext2_dirremove(fdvp, fcnp); if (!error) { ext2_dec_nlink(xp); xp->i_flag |= IN_CHANGE; } xp->i_flag &= ~IN_RENAME; } if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (doingdirectory) ip->i_flag &= ~IN_RENAME; if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { ext2_dec_nlink(ip); ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; vput(fvp); } else vrele(fvp); return (error); } #ifdef UFS_ACL static int ext2_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(M_WAITOK); dacl = acl_alloc(M_WAITOK); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); ip->i_mode = dmode; *dacl = *acl; ext2_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = dmode; error = 0; goto out; default: goto out; } error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); if (error == 0) error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ #ifdef DEBUG printf("ext2_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); #endif /* DEBUG */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ext2_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(M_WAITOK); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); ip->i_mode = mode; ext2_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = mode; error = 0; goto out; default: goto out; } error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. */ printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()\n"); /* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); return (error); } #endif /* UFS_ACL */ /* * Mkdir system call */ static int ext2_mkdir(struct vop_mkdir_args *ap) { struct m_ext2fs *fs; struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct dirtemplate dirtemplate, *dtp; char *buf = NULL; int error, dmode; -#ifdef INVARIANTS - if ((cnp->cn_flags & HASBUF) == 0) - panic("ext2_mkdir: no name"); -#endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= EXT4_LINK_MAX && !EXT2_HAS_RO_COMPAT_FEATURE(dp->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ext2_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = ext2_valloc(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); fs = ip->i_e2fs; ip->i_gid = dp->i_gid; #ifdef SUIDDIR { /* * if we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_nlink = 2; if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; error = ext2_update(tvp, 1); /* * Bump link count in parent directory * to reflect work done below. Should * be done before reference is created * so reparation is possible if we crash. */ ext2_inc_nlink(dp); dp->i_flag |= IN_CHANGE; error = ext2_update(dvp, !DOINGASYNC(dvp)); if (error) goto bad; /* Initialize directory with "." and ".." from static template. */ if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs, EXT2F_INCOMPAT_FTYPE)) dtp = &mastertemplate; else dtp = &omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = htole32(ip->i_number); dirtemplate.dotdot_ino = htole32(dp->i_number); /* * note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE so let's * just redefine it - for this function only */ #undef DIRBLKSIZ #define DIRBLKSIZ VTOI(dvp)->i_e2fs->e2fs_bsize dirtemplate.dotdot_reclen = htole16(DIRBLKSIZ - 12); buf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK | M_ZERO); if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { dirtemplate.dotdot_reclen = htole16(le16toh(dirtemplate.dotdot_reclen) - sizeof(struct ext2fs_direct_tail)); ext2_init_dirent_tail(EXT2_DIRENT_TAIL(buf, DIRBLKSIZ)); } memcpy(buf, &dirtemplate, sizeof(dirtemplate)); ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)buf); error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)buf, DIRBLKSIZ, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, cnp->cn_cred, NOCRED, NULL, NULL); if (error) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; goto bad; } if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) /* XXX should grow with balloc() */ panic("ext2_mkdir: blksize"); else { ip->i_size = DIRBLKSIZ; ip->i_flag |= IN_CHANGE; } #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ext2_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, curthread); if (error) goto bad; } #endif /* UFS_ACL */ /* Directory set up, now install its entry in the parent directory. */ error = ext2_direnter(ip, dvp, cnp); if (error) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; } bad: /* * No need to do an explicit VOP_TRUNCATE here, vrele will do this * for us because we set the link count to 0. */ if (error) { ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); } else *ap->a_vpp = tvp; out: free(buf, M_TEMP); return (error); #undef DIRBLKSIZ #define DIRBLKSIZ DEV_BSIZE } /* * Rmdir system call. */ static int ext2_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ if (!ext2_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ error = ext2_dirremove(dvp, cnp); if (error) goto out; ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; cache_purge(dvp); VOP_UNLOCK(dvp); /* * Truncate inode. The only stuff left * in the directory is "." and "..". */ ip->i_nlink = 0; error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred, curthread); cache_purge(ITOV(ip)); if (vn_lock(dvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(vp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } out: return (error); } /* * symlink -- make a symbolic link */ static int ext2_symlink(struct vop_symlink_args *ap) { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ext2_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < VFSTOEXT2(vp->v_mount)->um_e2fs->e2fs_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, (char *)ip->i_shortlink, len); ip->i_size = len; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target), len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, NULL, NULL); if (error) vput(vp); return (error); } /* * Return target name of a symbolic link */ static int ext2_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); int isize; isize = ip->i_size; if (isize < VFSTOEXT2(vp->v_mount)->um_e2fs->e2fs_maxsymlinklen) { uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ext2_bmaparray() operation may not * deadlock on memory. See ext2_bmap() for details. */ static int ext2_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct bufobj *bo; daddr_t blkno; int error; if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ext2_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { if (VTOI(ap->a_vp)->i_flag & IN_E4EXTENTS) error = ext4_bmapext(vp, bp->b_lblkno, &blkno, NULL, NULL); else error = ext2_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = VFSTOEXT2(vp->v_mount)->um_bo; BO_STRATEGY(bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ext2_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); vn_printf(ip->i_devvp, "\tino %ju", (uintmax_t)ip->i_number); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ext2fifo_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes_locked(vp); VI_UNLOCK(vp); return (fifo_specops.vop_close(ap)); } /* * Return POSIX pathconf information applicable to ext2 filesystems. */ static int ext2_pathconf(struct vop_pathconf_args *ap) { int error = 0; switch (ap->a_name) { case _PC_LINK_MAX: if (EXT2_HAS_RO_COMPAT_FEATURE(VTOI(ap->a_vp)->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) *ap->a_retval = INT_MAX; else *ap->a_retval = EXT4_LINK_MAX; break; case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; break; case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) *ap->a_retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; case _PC_NO_TRUNC: *ap->a_retval = 1; break; #ifdef UFS_ACL case _PC_ACL_EXTENDED: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; case _PC_ACL_PATH_MAX: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; break; #endif /* UFS_ACL */ case _PC_MIN_HOLE_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: *ap->a_retval = 64; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } /* * Vnode operation to remove a named attribute. */ static int ext2_deleteextattr(struct vop_deleteextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); error = ENOATTR; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_delete(ip, ap->a_attrnamespace, ap->a_name); if (error != ENOATTR) return (error); } if (ip->i_facl) error = ext2_extattr_block_delete(ip, ap->a_attrnamespace, ap->a_name); return (error); } /* * Vnode operation to retrieve a named extended attribute. */ static int ext2_getextattr(struct vop_getextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); if (ap->a_size != NULL) *ap->a_size = 0; error = ENOATTR; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_get(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size); if (error != ENOATTR) return (error); } if (ip->i_facl) error = ext2_extattr_block_get(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ext2_listextattr(struct vop_listextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); if (ap->a_size != NULL) *ap->a_size = 0; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_list(ip, ap->a_attrnamespace, ap->a_uio, ap->a_size); if (error) return (error); } if (ip->i_facl) error = ext2_extattr_block_list(ip, ap->a_attrnamespace, ap->a_uio, ap->a_size); return (error); } /* * Vnode operation to set a named attribute. */ static int ext2_setextattr(struct vop_setextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); error = ext2_extattr_valid_attrname(ap->a_attrnamespace, ap->a_name); if (error) return (error); if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_set(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio); if (error != ENOSPC) return (error); } error = ext2_extattr_block_set(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio); return (error); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ext2_vptofh(struct vop_vptofh_args *ap) { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ext2_vinit(struct mount *mntp, struct vop_vector *fifoops, struct vnode **vpp) { struct inode *ip; struct vnode *vp; vp = *vpp; ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); /* * Only unallocated inodes should be of type VNON. */ if (ip->i_mode != 0 && vp->v_type == VNON) return (EINVAL); if (vp->v_type == VFIFO) vp->v_op = fifoops; if (ip->i_number == EXT2_ROOTINO) vp->v_vflag |= VV_ROOT; ip->i_modrev = init_va_filerev(); *vpp = vp; return (0); } /* * Allocate a new inode. */ static int ext2_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct inode *ip, *pdir; struct vnode *tvp; int error; pdir = VTOI(dvp); -#ifdef INVARIANTS - if ((cnp->cn_flags & HASBUF) == 0) - panic("ext2_makeinode: no name"); -#endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = ext2_valloc(dvp, mode, cnp->cn_cred, &tvp); if (error) { return (error); } ip = VTOI(tvp); ip->i_gid = pdir->i_gid; #ifdef SUIDDIR { /* * if we are * not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; mode &= ~07111; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_nlink = 1; if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred)) { if (priv_check_cred(cnp->cn_cred, PRIV_VFS_RETAINSUGID)) ip->i_mode &= ~ISGID; } if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; /* * Make sure inode goes to disk before directory entry. */ error = ext2_update(tvp, !DOINGASYNC(tvp)); if (error) goto bad; #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ext2_do_posix1e_acl_inheritance_file(dvp, tvp, mode, cnp->cn_cred, curthread); if (error) goto bad; } #endif /* UFS_ACL */ error = ext2_direnter(ip, dvp, cnp); if (error) goto bad; *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); return (error); } /* * Vnode op for reading. */ static int ext2_read(struct vop_read_args *ap) { struct vnode *vp; struct inode *ip; struct uio *uio; struct m_ext2fs *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error, orig_resid, seqcount; int ioflag; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_READ) panic("%s: mode", "ext2_read"); if (vp->v_type == VLNK) { if ((int)ip->i_size < VFSTOEXT2(vp->v_mount)->um_e2fs->e2fs_maxsymlinklen) panic("%s: short symlink", "ext2_read"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("%s: type %d", "ext2_read", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ext2_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0")); fs = ip->i_e2fs; if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->e2fs_maxfilesize) return (EOVERFLOW); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->e2fs_fsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, 0, &bp); } else if (seqcount > 1) { u_int nextsize = blksize(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); if (error) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error because the loop * above resets bp to NULL on each iteration and on normal * completion has not set a new value into it. so it must have come * from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) ip->i_flag |= IN_ACCESS; return (error); } static int ext2_ioctl(struct vop_ioctl_args *ap) { struct vnode *vp; int error; vp = ap->a_vp; switch (ap->a_command) { case FIOSEEKDATA: if (!(VTOI(vp)->i_flag & IN_E4EXTENTS)) { error = vn_lock(vp, LK_SHARED); if (error == 0) { error = ext2_bmap_seekdata(vp, (off_t *)ap->a_data); VOP_UNLOCK(vp); } else error = EBADF; return (error); } case FIOSEEKHOLE: return (vn_bmap_seekhole(vp, ap->a_command, (off_t *)ap->a_data, ap->a_cred)); default: return (ENOTTY); } } /* * Vnode op for writing. */ static int ext2_write(struct vop_write_args *ap) { struct vnode *vp; struct uio *uio; struct inode *ip; struct m_ext2fs *fs; struct buf *bp; daddr_t lbn; off_t osize; int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; seqcount = ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE) panic("%s: mode", "ext2_write"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: /* XXX differs from ffs -- this is called from ext2_mkdir(). */ if ((ioflag & IO_SYNC) == 0) panic("ext2_write: nonsync dir write"); break; default: panic("ext2_write: type %p %d (%jd,%jd)", (void *)vp, vp->v_type, (intmax_t)uio->uio_offset, (intmax_t)uio->uio_resid); } KASSERT(uio->uio_resid >= 0, ("ext2_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ext2_write: uio->uio_offset < 0")); fs = ip->i_e2fs; if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->e2fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->e2fs_fsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->e2fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = ext2_balloc(ip, lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); if (error != 0) break; if ((ioflag & (IO_SYNC | IO_INVAL)) == (IO_SYNC | IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) ip->i_size = uio->uio_offset + xfersize; size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland mmap. * * Note that we need only clear buffers with a transfer size * equal to the block size because buffers with a shorter * transfer size were cleared above by the call to ext2_balloc() * with the BA_CLRBUF flag set. * * If the source region for uiomove identically mmaps the * buffer, uiomove() performed the NOP copy, and the buffer * content remains valid because the page fault handler * validated the pages. */ if (error != 0 && (bp->b_flags & B_CACHE) == 0 && fs->e2fs_bsize == xfersize) vfs_bio_clrbuf(bp); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->e2fs_fsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, &ip->i_clusterw, bp, ip->i_size, seqcount, 0); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) ip->i_mode &= ~(ISUID | ISGID); } if (error) { if (ioflag & IO_UNIT) { (void)ext2_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } if (uio->uio_resid != resid) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (ioflag & IO_SYNC) error = ext2_update(vp, 1); } return (error); } diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index 845ea04eca93..e89b831a4859 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -1,3182 +1,3168 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_io.h" #include /* Maximum number of hardlinks to a single FUSE file */ #define FUSE_LINK_MAX UINT32_MAX SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*"); /* vnode ops */ static vop_access_t fuse_vnop_access; static vop_advlock_t fuse_vnop_advlock; static vop_allocate_t fuse_vnop_allocate; static vop_bmap_t fuse_vnop_bmap; static vop_close_t fuse_fifo_close; static vop_close_t fuse_vnop_close; static vop_copy_file_range_t fuse_vnop_copy_file_range; static vop_create_t fuse_vnop_create; static vop_deallocate_t fuse_vnop_deallocate; static vop_deleteextattr_t fuse_vnop_deleteextattr; static vop_fdatasync_t fuse_vnop_fdatasync; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_getextattr_t fuse_vnop_getextattr; static vop_inactive_t fuse_vnop_inactive; static vop_ioctl_t fuse_vnop_ioctl; static vop_link_t fuse_vnop_link; static vop_listextattr_t fuse_vnop_listextattr; static vop_lookup_t fuse_vnop_lookup; static vop_mkdir_t fuse_vnop_mkdir; static vop_mknod_t fuse_vnop_mknod; static vop_open_t fuse_vnop_open; static vop_pathconf_t fuse_vnop_pathconf; static vop_read_t fuse_vnop_read; static vop_readdir_t fuse_vnop_readdir; static vop_readlink_t fuse_vnop_readlink; static vop_reclaim_t fuse_vnop_reclaim; static vop_remove_t fuse_vnop_remove; static vop_rename_t fuse_vnop_rename; static vop_rmdir_t fuse_vnop_rmdir; static vop_setattr_t fuse_vnop_setattr; static vop_setextattr_t fuse_vnop_setextattr; static vop_strategy_t fuse_vnop_strategy; static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; static vop_print_t fuse_vnop_print; static vop_vptofh_t fuse_vnop_vptofh; struct vop_vector fuse_fifoops = { .vop_default = &fifo_specops, .vop_access = fuse_vnop_access, .vop_close = fuse_fifo_close, .vop_fsync = fuse_vnop_fsync, .vop_getattr = fuse_vnop_getattr, .vop_inactive = fuse_vnop_inactive, .vop_pathconf = fuse_vnop_pathconf, .vop_print = fuse_vnop_print, .vop_read = VOP_PANIC, .vop_reclaim = fuse_vnop_reclaim, .vop_setattr = fuse_vnop_setattr, .vop_write = VOP_PANIC, .vop_vptofh = fuse_vnop_vptofh, }; VFS_VOP_VECTOR_REGISTER(fuse_fifoops); struct vop_vector fuse_vnops = { .vop_allocate = fuse_vnop_allocate, .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, .vop_advlock = fuse_vnop_advlock, .vop_bmap = fuse_vnop_bmap, .vop_close = fuse_vnop_close, .vop_copy_file_range = fuse_vnop_copy_file_range, .vop_create = fuse_vnop_create, .vop_deallocate = fuse_vnop_deallocate, .vop_deleteextattr = fuse_vnop_deleteextattr, .vop_fsync = fuse_vnop_fsync, .vop_fdatasync = fuse_vnop_fdatasync, .vop_getattr = fuse_vnop_getattr, .vop_getextattr = fuse_vnop_getextattr, .vop_inactive = fuse_vnop_inactive, .vop_ioctl = fuse_vnop_ioctl, .vop_link = fuse_vnop_link, .vop_listextattr = fuse_vnop_listextattr, .vop_lookup = fuse_vnop_lookup, .vop_mkdir = fuse_vnop_mkdir, .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = fuse_vnop_pathconf, /* * TODO: implement vop_poll after upgrading to protocol 7.21. * FUSE_POLL was added in protocol 7.11, but it's kind of broken until * 7.21, which adds the ability for the client to choose which poll * events it wants, and for a client to deregister a file handle */ .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, .vop_reclaim = fuse_vnop_reclaim, .vop_remove = fuse_vnop_remove, .vop_rename = fuse_vnop_rename, .vop_rmdir = fuse_vnop_rmdir, .vop_setattr = fuse_vnop_setattr, .vop_setextattr = fuse_vnop_setextattr, .vop_strategy = fuse_vnop_strategy, .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, .vop_print = fuse_vnop_print, .vop_vptofh = fuse_vnop_vptofh, }; VFS_VOP_VECTOR_REGISTER(fuse_vnops); uma_zone_t fuse_pbuf_zone; /* Check permission for extattr operations, much like extattr_check_cred */ static int fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred, struct thread *td, accmode_t accmode) { struct mount *mp = vnode_mount(vp); struct fuse_data *data = fuse_get_mpdata(mp); int default_permissions = data->dataflags & FSESS_DEFAULT_PERMISSIONS; /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (ns) { case EXTATTR_NAMESPACE_SYSTEM: if (default_permissions) { return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); } return (0); case EXTATTR_NAMESPACE_USER: if (default_permissions) { return (fuse_internal_access(vp, accmode, td, cred)); } return (0); default: return (EPERM); } } /* Get a filehandle for a directory */ static int fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) { if (fuse_filehandle_get(vp, FREAD, fufhp, cred, pid) == 0) return 0; return fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid); } /* Send FUSE_FLUSH for this vnode */ static int fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) { struct fuse_flush_in *ffi; struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct thread *td = curthread; struct mount *mp = vnode_mount(vp); int err; if (fsess_not_impl(vnode_mount(vp), FUSE_FLUSH)) return 0; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (err) return err; fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred); ffi = fdi.indata; ffi->fh = fufh->fh_id; /* * If the file has a POSIX lock then we're supposed to set lock_owner. * If not, then lock_owner is undefined. So we may as well always set * it. */ ffi->lock_owner = td->td_proc->p_pid; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FLUSH); err = 0; } fdisp_destroy(&fdi); return err; } /* Close wrapper for fifos. */ static int fuse_fifo_close(struct vop_close_args *ap) { return (fifo_specops.vop_close(ap)); } /* Invalidate a range of cached data, whether dirty of not */ static int fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end) { struct buf *bp; daddr_t left_lbn, end_lbn, right_lbn; off_t new_filesize; int iosize, left_on, right_on, right_blksize; iosize = fuse_iosize(vp); left_lbn = start / iosize; end_lbn = howmany(end, iosize); left_on = start & (iosize - 1); if (left_on != 0) { bp = getblk(vp, left_lbn, iosize, PCATCH, 0, 0); if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyend >= left_on) { /* * Flush the dirty buffer, because we don't have a * byte-granular way to record which parts of the * buffer are valid. */ bwrite(bp); if (bp->b_error) return (bp->b_error); } else { brelse(bp); } } right_on = end & (iosize - 1); if (right_on != 0) { right_lbn = end / iosize; new_filesize = MAX(filesize, end); right_blksize = MIN(iosize, new_filesize - iosize * right_lbn); bp = getblk(vp, right_lbn, right_blksize, PCATCH, 0, 0); if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyoff < right_on) { /* * Flush the dirty buffer, because we don't have a * byte-granular way to record which parts of the * buffer are valid. */ bwrite(bp); if (bp->b_error) return (bp->b_error); } else { brelse(bp); } } v_inval_buf_range(vp, left_lbn, end_lbn, iosize); return (0); } /* Send FUSE_LSEEK for this node */ static int fuse_vnop_do_lseek(struct vnode *vp, struct thread *td, struct ucred *cred, pid_t pid, off_t *offp, int whence) { struct fuse_dispatcher fdi; struct fuse_filehandle *fufh; struct fuse_lseek_in *flsi; struct fuse_lseek_out *flso; struct mount *mp = vnode_mount(vp); int err; ASSERT_VOP_LOCKED(vp, __func__); err = fuse_filehandle_getrw(vp, FREAD, &fufh, cred, pid); if (err) return (err); fdisp_init(&fdi, sizeof(*flsi)); fdisp_make_vp(&fdi, FUSE_LSEEK, vp, td, cred); flsi = fdi.indata; flsi->fh = fufh->fh_id; flsi->offset = *offp; flsi->whence = whence; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LSEEK); } else if (err == 0) { fsess_set_impl(mp, FUSE_LSEEK); flso = fdi.answ; *offp = flso->offset; } fdisp_destroy(&fdi); return (err); } /* struct vnop_access_args { struct vnode *a_vp; #if VOP_ACCESS_TAKES_ACCMODE_T accmode_t a_accmode; #else int a_mode; #endif struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; if (fuse_isdeadfs(vp)) { if (vnode_isvroot(vp)) { return 0; } return ENXIO; } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { if (priv_check_cred(cred, PRIV_VFS_ADMIN) || (fuse_match_cred(data->daemoncred, cred) == 0)) { return 0; } } return EBADF; } if (vnode_islnk(vp)) { return 0; } err = fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred); return err; } /* * struct vop_advlock_args { * struct vop_generic_args a_gen; * struct vnode *a_vp; * void *a_id; * int a_op; * struct flock *a_fl; * int a_flags; * } */ static int fuse_vnop_advlock(struct vop_advlock_args *ap) { struct vnode *vp = ap->a_vp; struct flock *fl = ap->a_fl; struct thread *td = curthread; struct ucred *cred = td->td_ucred; pid_t pid = td->td_proc->p_pid; struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct fuse_lk_in *fli; struct fuse_lk_out *flo; enum fuse_opcode op; int dataflags, err; int flags = ap->a_flags; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; if (fuse_isdeadfs(vp)) { return ENXIO; } switch(ap->a_op) { case F_GETLK: op = FUSE_GETLK; break; case F_SETLK: if (flags & F_WAIT) op = FUSE_SETLKW; else op = FUSE_SETLK; break; case F_UNLCK: op = FUSE_SETLK; break; default: return EINVAL; } if (!(dataflags & FSESS_POSIX_LOCKS)) return vop_stdadvlock(ap); /* FUSE doesn't properly support flock until protocol 7.17 */ if (flags & F_FLOCK) return vop_stdadvlock(ap); vn_lock(vp, LK_SHARED | LK_RETRY); err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid); if (err) goto out; fdisp_init(&fdi, sizeof(*fli)); fdisp_make_vp(&fdi, op, vp, td, cred); fli = fdi.indata; fli->fh = fufh->fh_id; fli->owner = td->td_proc->p_pid; fli->lk.start = fl->l_start; if (fl->l_len != 0) fli->lk.end = fl->l_start + fl->l_len - 1; else fli->lk.end = INT64_MAX; fli->lk.type = fl->l_type; fli->lk.pid = td->td_proc->p_pid; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); if (err == 0 && op == FUSE_GETLK) { flo = fdi.answ; fl->l_type = flo->lk.type; fl->l_pid = flo->lk.pid; if (flo->lk.type != F_UNLCK) { fl->l_start = flo->lk.start; if (flo->lk.end == INT64_MAX) fl->l_len = 0; else fl->l_len = flo->lk.end - flo->lk.start + 1; fl->l_start = flo->lk.start; } } out: VOP_UNLOCK(vp); return err; } static int fuse_vnop_allocate(struct vop_allocate_args *ap) { struct vnode *vp = ap->a_vp; off_t *len = ap->a_len; off_t *offset = ap->a_offset; struct ucred *cred = ap->a_cred; struct fuse_filehandle *fufh; struct mount *mp = vnode_mount(vp); struct fuse_dispatcher fdi; struct fuse_fallocate_in *ffi; struct uio io; pid_t pid = curthread->td_proc->p_pid; struct fuse_vnode_data *fvdat = VTOFUD(vp); off_t filesize; int err; if (fuse_isdeadfs(vp)) return (ENXIO); switch (vp->v_type) { case VFIFO: return (ESPIPE); case VLNK: case VREG: if (vfs_isrdonly(mp)) return (EROFS); break; default: return (ENODEV); } if (vfs_isrdonly(mp)) return (EROFS); if (fsess_not_impl(mp, FUSE_FALLOCATE)) return (EINVAL); io.uio_offset = *offset; io.uio_resid = *len; err = vn_rlimit_fsize(vp, &io, curthread); if (err) return (err); err = fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); if (err) return (err); fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) return (err); fuse_inval_buf_range(vp, filesize, *offset, *offset + *len); fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FALLOCATE, vp, curthread, cred); ffi = fdi.indata; ffi->fh = fufh->fh_id; ffi->offset = *offset; ffi->length = *len; ffi->mode = 0; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FALLOCATE); err = EINVAL; } else if (err == EOPNOTSUPP) { /* * The file system server does not support FUSE_FALLOCATE with * the supplied mode for this particular file. */ err = EINVAL; } else if (!err) { *offset += *len; *len = 0; fuse_vnode_undirty_cached_timestamps(vp, false); fuse_internal_clear_suid_on_write(vp, cred, curthread); if (*offset > fvdat->cached_attrs.va_size) { fuse_vnode_setsize(vp, *offset, false); getnanouptime(&fvdat->last_local_modify); } } return (err); } /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ static int fuse_vnop_bmap(struct vop_bmap_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj **bo = ap->a_bop; struct thread *td = curthread; struct mount *mp; struct fuse_dispatcher fdi; struct fuse_bmap_in *fbi; struct fuse_bmap_out *fbo; struct fuse_data *data; struct fuse_vnode_data *fvdat = VTOFUD(vp); uint64_t biosize; off_t fsize; daddr_t lbn = ap->a_bn; daddr_t *pbn = ap->a_bnp; int *runp = ap->a_runp; int *runb = ap->a_runb; int error = 0; int maxrun; if (fuse_isdeadfs(vp)) { return ENXIO; } mp = vnode_mount(vp); data = fuse_get_mpdata(mp); biosize = fuse_iosize(vp); maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1, data->max_readahead_blocks); if (bo != NULL) *bo = &vp->v_bufobj; /* * The FUSE_BMAP operation does not include the runp and runb * variables, so we must guess. Report nonzero contiguous runs so * cluster_read will combine adjacent reads. It's worthwhile to reduce * upcalls even if we don't know the true physical layout of the file. * * FUSE file systems may opt out of read clustering in two ways: * * mounting with -onoclusterr * * Setting max_readahead <= maxbcachebuf during FUSE_INIT */ if (runb != NULL) *runb = MIN(lbn, maxrun); if (runp != NULL && maxrun == 0) *runp = 0; else if (runp != NULL) { /* * If the file's size is cached, use that value to calculate * runp, even if the cache is expired. runp is only advisory, * and the risk of getting it wrong is not worth the cost of * another upcall. */ if (fvdat->cached_attrs.va_size != VNOVAL) fsize = fvdat->cached_attrs.va_size; else error = fuse_vnode_size(vp, &fsize, td->td_ucred, td); if (error == 0) *runp = MIN(MAX(0, fsize / (off_t)biosize - lbn - 1), maxrun); else *runp = 0; } if (fsess_maybe_impl(mp, FUSE_BMAP)) { fdisp_init(&fdi, sizeof(*fbi)); fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred); fbi = fdi.indata; fbi->block = lbn; fbi->blocksize = biosize; error = fdisp_wait_answ(&fdi); if (error == ENOSYS) { fdisp_destroy(&fdi); fsess_set_notimpl(mp, FUSE_BMAP); error = 0; } else { fbo = fdi.answ; if (error == 0 && pbn != NULL) *pbn = fbo->block; fdisp_destroy(&fdi); return error; } } /* If the daemon doesn't support BMAP, make up a sensible default */ if (pbn != NULL) *pbn = lbn * btodb(biosize); return (error); } /* struct vop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; struct thread *td = ap->a_td; pid_t pid = td->td_proc->p_pid; struct fuse_vnode_data *fvdat = VTOFUD(vp); int err = 0; if (fuse_isdeadfs(vp)) return 0; if (vnode_isdir(vp)) return 0; if (fflag & IO_NDELAY) return 0; err = fuse_flush(vp, cred, pid, fflag); if (err == 0 && (fvdat->flag & FN_ATIMECHANGE)) { struct vattr vap; VATTR_NULL(&vap); vap.va_atime = fvdat->cached_attrs.va_atime; err = fuse_internal_setattr(vp, &vap, td, NULL); } /* TODO: close the file handle, if we're sure it's no longer used */ if ((fvdat->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, cred, td->td_proc->p_pid); } return err; } /* struct vop_copy_file_range_args { struct vop_generic_args a_gen; struct vnode *a_invp; off_t *a_inoffp; struct vnode *a_outvp; off_t *a_outoffp; size_t *a_lenp; unsigned int a_flags; struct ucred *a_incred; struct ucred *a_outcred; struct thread *a_fsizetd; } */ static int fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap) { struct vnode *invp = ap->a_invp; struct vnode *outvp = ap->a_outvp; struct mount *mp = vnode_mount(invp); struct fuse_vnode_data *outfvdat = VTOFUD(outvp); struct fuse_dispatcher fdi; struct fuse_filehandle *infufh, *outfufh; struct fuse_copy_file_range_in *fcfri; struct ucred *incred = ap->a_incred; struct ucred *outcred = ap->a_outcred; struct fuse_write_out *fwo; struct thread *td; struct uio io; off_t outfilesize; pid_t pid; int err; if (mp != vnode_mount(outvp)) goto fallback; if (incred->cr_uid != outcred->cr_uid) goto fallback; if (incred->cr_groups[0] != outcred->cr_groups[0]) goto fallback; if (fsess_not_impl(mp, FUSE_COPY_FILE_RANGE)) goto fallback; if (ap->a_fsizetd == NULL) td = curthread; else td = ap->a_fsizetd; pid = td->td_proc->p_pid; /* Lock both vnodes, avoiding risk of deadlock. */ do { err = vn_lock(outvp, LK_EXCLUSIVE); if (invp == outvp) break; if (err == 0) { err = vn_lock(invp, LK_SHARED | LK_NOWAIT); if (err == 0) break; VOP_UNLOCK(outvp); err = vn_lock(invp, LK_SHARED); if (err == 0) VOP_UNLOCK(invp); } } while (err == 0); if (err != 0) return (err); err = fuse_filehandle_getrw(invp, FREAD, &infufh, incred, pid); if (err) goto unlock; err = fuse_filehandle_getrw(outvp, FWRITE, &outfufh, outcred, pid); if (err) goto unlock; if (ap->a_fsizetd) { io.uio_offset = *ap->a_outoffp; io.uio_resid = *ap->a_lenp; err = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd); if (err) goto unlock; } err = fuse_vnode_size(outvp, &outfilesize, outcred, curthread); if (err) goto unlock; err = fuse_inval_buf_range(outvp, outfilesize, *ap->a_outoffp, *ap->a_outoffp + *ap->a_lenp); if (err) goto unlock; fdisp_init(&fdi, sizeof(*fcfri)); fdisp_make_vp(&fdi, FUSE_COPY_FILE_RANGE, invp, td, incred); fcfri = fdi.indata; fcfri->fh_in = infufh->fh_id; fcfri->off_in = *ap->a_inoffp; fcfri->nodeid_out = VTOI(outvp); fcfri->fh_out = outfufh->fh_id; fcfri->off_out = *ap->a_outoffp; fcfri->len = *ap->a_lenp; fcfri->flags = 0; err = fdisp_wait_answ(&fdi); if (err == 0) { fwo = fdi.answ; *ap->a_lenp = fwo->size; *ap->a_inoffp += fwo->size; *ap->a_outoffp += fwo->size; fuse_internal_clear_suid_on_write(outvp, outcred, td); if (*ap->a_outoffp > outfvdat->cached_attrs.va_size) { fuse_vnode_setsize(outvp, *ap->a_outoffp, false); getnanouptime(&outfvdat->last_local_modify); } fuse_vnode_update(invp, FN_ATIMECHANGE); fuse_vnode_update(outvp, FN_MTIMECHANGE | FN_CTIMECHANGE); } fdisp_destroy(&fdi); unlock: if (invp != outvp) VOP_UNLOCK(invp); VOP_UNLOCK(outvp); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_COPY_FILE_RANGE); fallback: err = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp, ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred, ap->a_outcred, ap->a_fsizetd); } return (err); } static void fdisp_make_mknod_for_fallback( struct fuse_dispatcher *fdip, struct componentname *cnp, struct vnode *dvp, uint64_t parentnid, struct thread *td, struct ucred *cred, mode_t mode, enum fuse_opcode *op) { struct fuse_mknod_in *fmni; fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1); *op = FUSE_MKNOD; fdisp_make(fdip, *op, vnode_mount(dvp), parentnid, td, cred); fmni = fdip->indata; fmni->mode = mode; fmni->rdev = 0; memcpy((char *)fdip->indata + sizeof(*fmni), cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[sizeof(*fmni) + cnp->cn_namelen] = '\0'; } /* struct vnop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct thread *td = curthread; struct ucred *cred = cnp->cn_cred; struct fuse_data *data; struct fuse_create_in *fci; struct fuse_entry_out *feo; struct fuse_open_out *foo; struct fuse_dispatcher fdi, fdi2; struct fuse_dispatcher *fdip = &fdi; struct fuse_dispatcher *fdip2 = NULL; int err; struct mount *mp = vnode_mount(dvp); data = fuse_get_mpdata(mp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); enum fuse_opcode op; int flags; if (fuse_isdeadfs(dvp)) return ENXIO; /* FUSE expects sockets to be created with FUSE_MKNOD */ if (vap->va_type == VSOCK) return fuse_internal_mknod(dvp, vpp, cnp, vap); /* * VOP_CREATE doesn't tell us the open(2) flags, so we guess. Only a * writable mode makes sense, and we might as well include readability * too. */ flags = O_RDWR; bzero(&fdi, sizeof(fdi)); if (vap->va_type != VREG) return (EINVAL); if (fsess_not_impl(mp, FUSE_CREATE) || vap->va_type == VSOCK) { /* Fallback to FUSE_MKNOD/FUSE_OPEN */ fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, cred, mode, &op); } else { /* Use FUSE_CREATE */ size_t insize; op = FUSE_CREATE; fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1); fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred); fci = fdip->indata; fci->mode = mode; fci->flags = O_CREAT | flags; if (fuse_libabi_geq(data, 7, 12)) { insize = sizeof(*fci); fci->umask = td->td_proc->p_pd->pd_cmask; } else { insize = sizeof(struct fuse_open_in); } memcpy((char *)fdip->indata + insize, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0'; } err = fdisp_wait_answ(fdip); if (err) { if (err == ENOSYS && op == FUSE_CREATE) { fsess_set_notimpl(mp, FUSE_CREATE); fdisp_destroy(fdip); fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, cred, mode, &op); err = fdisp_wait_answ(fdip); } if (err) goto out; } feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, vap->va_type))) { goto out; } if (op == FUSE_CREATE) { if (fuse_libabi_geq(data, 7, 9)) foo = (struct fuse_open_out*)(feo + 1); else foo = (struct fuse_open_out*)((char*)feo + FUSE_COMPAT_ENTRY_OUT_SIZE); } else { /* Issue a separate FUSE_OPEN */ struct fuse_open_in *foi; fdip2 = &fdi2; fdisp_init(fdip2, sizeof(*foi)); fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td, cred); foi = fdip2->indata; foi->flags = flags; err = fdisp_wait_answ(fdip2); if (err) goto out; foo = fdip2->answ; } err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; uint64_t fh_id = foo->fh; fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; fri->flags = flags; fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); fuse_insert_message(fdip->tick, false); goto out; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL, true); fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo); fuse_vnode_open(*vpp, foo->open_flags, td); /* * Purge the parent's attribute cache because the daemon should've * updated its mtime and ctime */ fuse_vnode_clear_attr_cache(dvp); cache_purge_negative(dvp); out: if (fdip2) fdisp_destroy(fdip2); fdisp_destroy(fdip); return err; } /* struct vnop_fdatasync_args { struct vop_generic_args a_gen; struct vnode * a_vp; struct thread * a_td; }; */ static int fuse_vnop_fdatasync(struct vop_fdatasync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; int waitfor = MNT_WAIT; int err = 0; if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfdatasync_buf(ap))) return err; return fuse_internal_fsync(vp, td, waitfor, true); } /* struct vnop_fsync_args { struct vop_generic_args a_gen; struct vnode * a_vp; int a_waitfor; struct thread * a_td; }; */ static int fuse_vnop_fsync(struct vop_fsync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; int waitfor = ap->a_waitfor; int err = 0; if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; return fuse_internal_fsync(vp, td, waitfor, false); } /* struct vnop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; int err = 0; int dataflags; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; /* Note that we are not bailing out on a dead file system just yet. */ if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); err = ENOTCONN; return err; } else { goto fake; } } err = fuse_internal_getattr(vp, vap, cred, td); if (err == ENOTCONN && vnode_isvroot(vp)) { /* see comment in fuse_vfsop_statfs() */ goto fake; } else { return err; } fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); return 0; } /* struct vnop_inactive_args { struct vnode *a_vp; }; */ static int fuse_vnop_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = curthread; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh, *fufh_tmp; int need_flush = 1; LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { if (need_flush && vp->v_type == VREG) { if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, NULL, 0); } if ((fvdat->flag & FN_REVOKED) != 0) fuse_io_invalbuf(vp, td); else fuse_io_flushbuf(vp, MNT_WAIT, td); need_flush = 0; } fuse_filehandle_close(vp, fufh, td, NULL); } if ((fvdat->flag & FN_REVOKED) != 0) vrecycle(vp); return 0; } /* struct vnop_ioctl_args { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_ioctl(struct vop_ioctl_args *ap) { struct vnode *vp = ap->a_vp; struct mount *mp = vnode_mount(vp); struct ucred *cred = ap->a_cred; off_t *offp; pid_t pid = ap->a_td->td_proc->p_pid; int err; switch (ap->a_command) { case FIOSEEKDATA: case FIOSEEKHOLE: /* Call FUSE_LSEEK, if we can, or fall back to vop_stdioctl */ if (fsess_maybe_impl(mp, FUSE_LSEEK)) { int whence; offp = ap->a_data; if (ap->a_command == FIOSEEKDATA) whence = SEEK_DATA; else whence = SEEK_HOLE; vn_lock(vp, LK_SHARED | LK_RETRY); err = fuse_vnop_do_lseek(vp, ap->a_td, cred, pid, offp, whence); VOP_UNLOCK(vp); } if (fsess_not_impl(mp, FUSE_LSEEK)) err = vop_stdioctl(ap); break; default: /* TODO: implement FUSE_IOCTL */ err = ENOTTY; break; } return (err); } /* struct vnop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = VTOVA(vp); struct fuse_dispatcher fdi; struct fuse_entry_out *feo; struct fuse_link_in fli; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_mount(tdvp) != vnode_mount(vp)) { return EXDEV; } /* * This is a seatbelt check to protect naive userspace filesystems from * themselves and the limitations of the FUSE IPC protocol. If a * filesystem does not allow attribute caching, assume it is capable of * validating that nlink does not overflow. */ if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX) return EMLINK; fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp, FUSE_LINK, &fli, sizeof(fli), &fdi); if ((err = fdisp_wait_answ(&fdi))) { goto out; } feo = fdi.answ; if (fli.oldnodeid != feo->nodeid) { struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); fuse_warn(data, FSESS_WARN_ILLEGAL_INODE, "Assigned wrong inode for a hard link."); fuse_vnode_clear_attr_cache(vp); fuse_vnode_clear_attr_cache(tdvp); err = EIO; goto out; } err = fuse_internal_checkentry(feo, vnode_vtype(vp)); if (!err) { /* * Purge the parent's attribute cache because the daemon * should've updated its mtime and ctime */ fuse_vnode_clear_attr_cache(tdvp); fuse_internal_cache_attrs(vp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL, true); } out: fdisp_destroy(&fdi); return err; } struct fuse_lookup_alloc_arg { struct fuse_entry_out *feo; struct componentname *cnp; uint64_t nid; enum vtype vtyp; }; /* Callback for vn_get_ino */ static int fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { struct fuse_lookup_alloc_arg *flaa = arg; return fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp, flaa->vtyp); } SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup, "int", "struct timespec*", "struct timespec*"); /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; */ int fuse_vnop_lookup(struct vop_lookup_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct thread *td = curthread; struct ucred *cred = cnp->cn_cred; struct timespec now; int nameiop = cnp->cn_nameiop; int flags = cnp->cn_flags; - int wantparent = flags & (LOCKPARENT | WANTPARENT); int islastcn = flags & ISLASTCN; struct mount *mp = vnode_mount(dvp); struct fuse_data *data = fuse_get_mpdata(mp); int default_permissions = data->dataflags & FSESS_DEFAULT_PERMISSIONS; bool is_dot; int err = 0; int lookup_err = 0; struct vnode *vp = NULL; struct fuse_dispatcher fdi; bool did_lookup = false; struct fuse_entry_out *feo = NULL; enum vtype vtyp; /* vnode type of target */ uint64_t nid; if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } if (!vnode_isdir(dvp)) return ENOTDIR; if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) return EROFS; if ((cnp->cn_flags & NOEXECCHECK) != 0) cnp->cn_flags &= ~NOEXECCHECK; else if ((err = fuse_internal_access(dvp, VEXEC, td, cred))) return err; is_dot = cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.'; if ((flags & ISDOTDOT) && !(data->dataflags & FSESS_EXPORT_SUPPORT)) { if (!(VTOFUD(dvp)->flag & FN_PARENT_NID)) { /* * Since the file system doesn't support ".." lookups, * we have no way to find this entry. */ return ESTALE; } nid = VTOFUD(dvp)->parent_nid; if (nid == 0) return ENOENT; /* .. is obviously a directory */ vtyp = VDIR; } else if (is_dot) { nid = VTOI(dvp); /* . is obviously a directory */ vtyp = VDIR; } else { struct timespec timeout; int ncpticks; /* here to accommodate for API contract */ err = cache_lookup(dvp, vpp, cnp, &timeout, &ncpticks); getnanouptime(&now); SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now); switch (err) { case -1: /* positive match */ if (timespeccmp(&timeout, &now, >)) { counter_u64_add(fuse_lookup_cache_hits, 1); } else { /* Cache timeout */ counter_u64_add(fuse_lookup_cache_misses, 1); bintime_clear( &VTOFUD(*vpp)->entry_cache_timeout); cache_purge(*vpp); if (dvp != *vpp) vput(*vpp); else vrele(*vpp); *vpp = NULL; break; } return 0; case 0: /* no match in cache */ counter_u64_add(fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ if (timespeccmp(&timeout, &now, <=)) { /* Cache timeout */ cache_purge_negative(dvp); break; } /* fall through */ default: return err; } fdisp_init(&fdi, cnp->cn_namelen + 1); fdisp_make(&fdi, FUSE_LOOKUP, mp, VTOI(dvp), td, cred); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; lookup_err = fdisp_wait_answ(&fdi); did_lookup = true; if (!lookup_err) { /* lookup call succeeded */ feo = (struct fuse_entry_out *)fdi.answ; nid = feo->nodeid; if (nid == 0) { /* zero nodeid means ENOENT and cache it */ struct timespec timeout; fdi.answ_stat = ENOENT; lookup_err = ENOENT; if (cnp->cn_flags & MAKEENTRY) { fuse_validity_2_timespec(feo, &timeout); /* Use the same entry_time for .. as for * the file itself. That doesn't honor * exactly what the fuse server tells * us, but to do otherwise would require * another cache lookup at this point. */ struct timespec *dtsp = NULL; cache_enter_time(dvp, *vpp, cnp, &timeout, dtsp); } } vtyp = IFTOVT(feo->attr.mode); } if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) { fdisp_destroy(&fdi); return lookup_err; } } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { /* Entry not found */ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { if (default_permissions) err = fuse_internal_access(dvp, VWRITE, td, cred); else err = 0; if (!err) { - /* - * Set the SAVENAME flag to hold onto the - * pathname for use later in VOP_CREATE or - * VOP_RENAME. - */ - cnp->cn_flags |= SAVENAME; - err = EJUSTRETURN; } } else { err = ENOENT; } } else { /* Entry was found */ if (flags & ISDOTDOT) { struct fuse_lookup_alloc_arg flaa; flaa.nid = nid; flaa.feo = feo; flaa.cnp = cnp; flaa.vtyp = vtyp; err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0, &vp); *vpp = vp; } else if (nid == VTOI(dvp)) { if (is_dot) { vref(dvp); *vpp = dvp; } else { fuse_warn(fuse_get_mpdata(mp), FSESS_WARN_ILLEGAL_INODE, "Assigned same inode to both parent and " "child."); err = EIO; } } else { struct fuse_vnode_data *fvdat; err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, &vp, cnp, vtyp); if (err) goto out; *vpp = vp; fvdat = VTOFUD(vp); MPASS(feo != NULL); if (timespeccmp(&now, &fvdat->last_local_modify, >)) { /* * Attributes from the server are definitely * newer than the last attributes we sent to * the server, so cache them. */ fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL, true); } fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec, &fvdat->entry_cache_timeout); if ((nameiop == DELETE || nameiop == RENAME) && islastcn && default_permissions) { struct vattr dvattr; err = fuse_internal_access(dvp, VWRITE, td, cred); if (err != 0) goto out; /* * if the parent's sticky bit is set, check * whether we're allowed to remove the file. * Need to figure out the vnode locking to make * this work. */ fuse_internal_getattr(dvp, &dvattr, cred, td); if ((dvattr.va_mode & S_ISTXT) && fuse_internal_access(dvp, VADMIN, td, cred) && fuse_internal_access(*vpp, VADMIN, td, cred)) { err = EPERM; goto out; } } - - if (islastcn && ( - (nameiop == DELETE) || - (nameiop == RENAME && wantparent))) { - cnp->cn_flags |= SAVENAME; - } } } out: if (err) { if (vp != NULL && dvp != vp) vput(vp); else if (vp != NULL) vrele(vp); *vpp = NULL; } if (did_lookup) fdisp_destroy(&fdi); return err; } /* struct vnop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct fuse_mkdir_in fmdi; if (fuse_isdeadfs(dvp)) { return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); fmdi.umask = curthread->td_proc->p_pd->pd_cmask; return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); } /* struct vnop_mknod_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mknod(struct vop_mknod_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; if (fuse_isdeadfs(dvp)) return ENXIO; return fuse_internal_mknod(dvp, vpp, cnp, vap); } /* struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; / struct file *a_fp; }; */ static int fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; int a_mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; pid_t pid = td->td_proc->p_pid; if (fuse_isdeadfs(vp)) return ENXIO; if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO) return (EOPNOTSUPP); if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0) return EINVAL; if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) { fuse_vnode_open(vp, 0, td); return 0; } return fuse_filehandle_open(vp, a_mode, NULL, td, cred); } static int fuse_vnop_pathconf(struct vop_pathconf_args *ap) { struct vnode *vp = ap->a_vp; struct mount *mp; switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = MIN(LONG_MAX, FUSE_LINK_MAX); return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); case _PC_MIN_HOLE_SIZE: /* * The FUSE protocol provides no mechanism for a server to * report _PC_MIN_HOLE_SIZE. It's a protocol bug. Instead, * return EINVAL if the server does not support FUSE_LSEEK, or * 1 if it does. */ mp = vnode_mount(vp); if (!fsess_is_impl(mp, FUSE_LSEEK) && !fsess_not_impl(mp, FUSE_LSEEK)) { off_t offset = 0; /* Issue a FUSE_LSEEK to find out if it's implemented */ fuse_vnop_do_lseek(vp, curthread, curthread->td_ucred, curthread->td_proc->p_pid, &offset, SEEK_DATA); } if (fsess_is_impl(mp, FUSE_LSEEK)) { *ap->a_retval = 1; return (0); } else { /* * Probably FUSE_LSEEK is not implemented. It might * be, if the FUSE_LSEEK above returned an error like * EACCES, but in that case we can't tell, so it's * safest to report EINVAL anyway. */ return (EINVAL); } default: return (vop_stdpathconf(ap)); } } SDT_PROBE_DEFINE3(fusefs, , vnops, filehandles_closed, "struct vnode*", "struct uio*", "struct ucred*"); /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; struct fuse_filehandle *fufh; int err; bool closefufh = false, directio; MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } err = fuse_filehandle_getrw(vp, FREAD, &fufh, cred, pid); if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { /* * nfsd will do I/O without first doing VOP_OPEN. We * must implicitly open the file here */ err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred); closefufh = true; } if (err) { SDT_PROBE3(fusefs, , vnops, filehandles_closed, vp, uio, cred); return err; } /* * Ideally, when the daemon asks for direct io at open time, the * standard file flag should be set according to this, so that would * just change the default mode, which later on could be changed via * fcntl(2). * But this doesn't work, the O_DIRECT flag gets cleared at some point * (don't know where). So to make any use of the Fuse direct_io option, * we hardwire it into the file's private data (similarly to Linux, * btw.). */ directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); fuse_vnode_update(vp, FN_ATIMECHANGE); if (directio) { SDT_PROBE2(fusefs, , vnops, trace, 1, "direct read of vnode"); err = fuse_read_directbackend(vp, uio, cred, fufh); } else { SDT_PROBE2(fusefs, , vnops, trace, 1, "buffered read of vnode"); err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh, pid); } if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (err); } /* struct vnop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; uint64_t **a_cookies; }; */ static int fuse_vnop_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_filehandle *fufh = NULL; struct mount *mp = vnode_mount(vp); struct fuse_iov cookediov; int err = 0; uint64_t *cookies; ssize_t tresid; int ncookies; bool closefufh = false; pid_t pid = curthread->td_proc->p_pid; if (ap->a_eofflag) *ap->a_eofflag = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } if ( /* XXXIP ((uio_iovcnt(uio) > 1)) || */ (uio_resid(uio) < sizeof(struct dirent))) { return EINVAL; } tresid = uio->uio_resid; err = fuse_filehandle_get_dir(vp, &fufh, cred, pid); if (err == EBADF && mp->mnt_flag & MNT_EXPORTED) { KASSERT(fuse_get_mpdata(mp)->dataflags & FSESS_NO_OPENDIR_SUPPORT, ("FUSE file systems that don't set " "FUSE_NO_OPENDIR_SUPPORT should not be exported")); /* * nfsd will do VOP_READDIR without first doing VOP_OPEN. We * must implicitly open the directory here. */ err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred); closefufh = true; } if (err) return (err); if (ap->a_ncookies != NULL) { ncookies = uio->uio_resid / (offsetof(struct dirent, d_name) + 4) + 1; cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } else { ncookies = 0; cookies = NULL; } #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1) fiov_init(&cookediov, DIRCOOKEDSIZE); err = fuse_internal_readdir(vp, uio, fufh, &cookediov, &ncookies, cookies); fiov_teardown(&cookediov); if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); if (ap->a_ncookies != NULL) { if (err == 0) { *ap->a_ncookies -= ncookies; } else { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } } if (err == 0 && tresid == uio->uio_resid) *ap->a_eofflag = 1; return err; } /* struct vnop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; */ static int fuse_vnop_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_dispatcher fdi; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (!vnode_islnk(vp)) { return EINVAL; } fdisp_init(&fdi, 0); err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred); if (err) { goto out; } if (((char *)fdi.answ)[0] == '/' && fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) { char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname; err = uiomove(mpth, strlen(mpth), uio); } if (!err) { err = uiomove(fdi.answ, fdi.iosize, uio); } out: fdisp_destroy(&fdi); return err; } /* struct vnop_reclaim_args { struct vnode *a_vp; }; */ static int fuse_vnop_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = curthread; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh, *fufh_tmp; if (!fvdat) { panic("FUSE: no vnode data during recycling"); } LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { printf("FUSE: vnode being reclaimed with open fufh " "(type=%#x)", fufh->fufh_type); fuse_filehandle_close(vp, fufh, td, NULL); } if (VTOI(vp) == 1) { /* * Don't send FUSE_FORGET for the root inode, because * we never send FUSE_LOOKUP for it (see * fuse_vfsop_root) and we don't want the server to see * mismatched lookup counts. */ struct fuse_data *data; struct vnode *vroot; data = fuse_get_mpdata(vnode_mount(vp)); FUSE_LOCK(); vroot = data->vroot; data->vroot = NULL; FUSE_UNLOCK(); if (vroot) vrele(vroot); } else if (!fuse_isdeadfs(vp) && fvdat->nlookup > 0) { fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp), fvdat->nlookup); } cache_purge(vp); vfs_hash_remove(vp); fuse_vnode_destroy(vp); return 0; } /* struct vnop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_isdir(vp)) { return EPERM; } err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); return err; } /* struct vnop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; */ static int fuse_vnop_rename(struct vop_rename_args *ap) { struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct componentname *fcnp = ap->a_fcnp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct fuse_data *data; bool newparent = fdvp != tdvp; bool isdir = fvp->v_type == VDIR; int err = 0; if (fuse_isdeadfs(fdvp)) { return ENXIO; } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename"); err = EXDEV; goto out; } cache_purge(fvp); /* * FUSE library is expected to check if target directory is not * under the source directory in the file system tree. * Linux performs this check at VFS level. */ /* * If source is a directory, and it will get a new parent, user must * have write permission to it, so ".." can be modified. */ data = fuse_get_mpdata(vnode_mount(tdvp)); if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) { err = fuse_internal_access(fvp, VWRITE, curthread, tcnp->cn_cred); if (err) goto out; } sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); if (err == 0) { if (tdvp != fdvp) fuse_vnode_setparent(fvp, tdvp); if (tvp != NULL) fuse_vnode_setparent(tvp, NULL); } sx_unlock(&data->rename_lock); if (tvp != NULL && tvp != fvp) { cache_purge(tvp); } if (vnode_isdir(fvp)) { if (((tvp != NULL) && vnode_isdir(tvp)) || vnode_isdir(fvp)) { cache_purge(tdvp); } cache_purge(fdvp); } out: if (tdvp == tvp) { vrele(tdvp); } else { vput(tdvp); } if (tvp != NULL) { vput(tvp); } vrele(fdvp); vrele(fvp); return err; } /* struct vnop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } *ap; */ static int fuse_vnop_rmdir(struct vop_rmdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp) == VTOFUD(dvp)) { return EINVAL; } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); return err; } /* struct vnop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct mount *mp; struct fuse_data *data; struct vattr old_va; int dataflags; int err = 0, err2; accmode_t accmode = 0; bool checkperm; bool drop_suid = false; gid_t cr_gid; mp = vnode_mount(vp); data = fuse_get_mpdata(mp); dataflags = data->dataflags; checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS; if (cred->cr_ngroups > 0) cr_gid = cred->cr_groups[0]; else cr_gid = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vap->va_uid != (uid_t)VNOVAL) { if (checkperm) { /* Only root may change a file's owner */ err = priv_check_cred(cred, PRIV_VFS_CHOWN); if (err) { /* As a special case, allow the null chown */ err2 = fuse_internal_getattr(vp, &old_va, cred, td); if (err2) return (err2); if (vap->va_uid != old_va.va_uid) return err; else accmode |= VADMIN; drop_suid = true; } else accmode |= VADMIN; } else accmode |= VADMIN; } if (vap->va_gid != (gid_t)VNOVAL) { if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN)) drop_suid = true; if (checkperm && !groupmember(vap->va_gid, cred)) { /* * Non-root users may only chgrp to one of their own * groups */ err = priv_check_cred(cred, PRIV_VFS_CHOWN); if (err) { /* As a special case, allow the null chgrp */ err2 = fuse_internal_getattr(vp, &old_va, cred, td); if (err2) return (err2); if (vap->va_gid != old_va.va_gid) return err; accmode |= VADMIN; } else accmode |= VADMIN; } else accmode |= VADMIN; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vfs_isrdonly(mp)) return (EROFS); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support shared memory objects in the file * system, and have dubious support for truncating * symlinks. Just ignore the request in other cases. */ return (0); } /* Don't set accmode. Permission to trunc is checked upstack */ } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vap->va_vaflags & VA_UTIMES_NULL) accmode |= VWRITE; else accmode |= VADMIN; } if (drop_suid) { if (vap->va_mode != (mode_t)VNOVAL) vap->va_mode &= ~(S_ISUID | S_ISGID); else { err = fuse_internal_getattr(vp, &old_va, cred, td); if (err) return (err); vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID); } } if (vap->va_mode != (mode_t)VNOVAL) { /* Only root may set the sticky bit on non-directories */ if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT) && priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return EFTYPE; if (checkperm && (vap->va_mode & S_ISGID)) { err = fuse_internal_getattr(vp, &old_va, cred, td); if (err) return (err); if (!groupmember(old_va.va_gid, cred)) { err = priv_check_cred(cred, PRIV_VFS_SETGID); if (err) return (err); } } accmode |= VADMIN; } if (vfs_isrdonly(mp)) return EROFS; if (checkperm) { err = fuse_internal_access(vp, accmode, td, cred); } else { err = 0; } if (err) return err; else return fuse_internal_setattr(vp, vap, td, cred); } /* struct vnop_strategy_args { struct vnode *a_vp; struct buf *a_bp; }; */ static int fuse_vnop_strategy(struct vop_strategy_args *ap) { struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; if (!vp || fuse_isdeadfs(vp)) { bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); return 0; } /* * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags. * fuse_io_strategy sets bp's error fields */ (void)fuse_io_strategy(vp, bp); return 0; } /* struct vnop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; */ static int fuse_vnop_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; const char *target = ap->a_target; struct fuse_dispatcher fdi; int err; size_t len; if (fuse_isdeadfs(dvp)) { return ENXIO; } /* * Unlike the other creator type calls, here we have to create a message * where the name of the new entry comes first, and the data describing * the entry comes second. * Hence we can't rely on our handy fuse_internal_newentry() routine, * but put together the message manually and just call the core part. */ len = strlen(target) + 1; fdisp_init(&fdi, len + cnp->cn_namelen + 1); fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len); err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi); fdisp_destroy(&fdi); return err; } /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_write(struct vop_write_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; struct fuse_filehandle *fufh; int err; bool closefufh = false, directio; MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } err = fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { /* * nfsd will do I/O without first doing VOP_OPEN. We * must implicitly open the file here */ err = fuse_filehandle_open(vp, FWRITE, &fufh, curthread, cred); closefufh = true; } if (err) { SDT_PROBE3(fusefs, , vnops, filehandles_closed, vp, uio, cred); return err; } /* * Ideally, when the daemon asks for direct io at open time, the * standard file flag should be set according to this, so that would * just change the default mode, which later on could be changed via * fcntl(2). * But this doesn't work, the O_DIRECT flag gets cleared at some point * (don't know where). So to make any use of the Fuse direct_io option, * we hardwire it into the file's private data (similarly to Linux, * btw.). */ directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); if (directio) { off_t start, end, filesize; bool pages = (ioflag & IO_VMIO) != 0; SDT_PROBE2(fusefs, , vnops, trace, 1, "direct write of vnode"); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) goto out; start = uio->uio_offset; end = start + uio->uio_resid; if (!pages) { err = fuse_inval_buf_range(vp, filesize, start, end); if (err) goto out; } err = fuse_write_directbackend(vp, uio, cred, fufh, filesize, ioflag, pages); } else { SDT_PROBE2(fusefs, , vnops, trace, 1, "buffered write of vnode"); if (!fsess_opt_writeback(vnode_mount(vp))) ioflag |= IO_SYNC; err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, pid); } fuse_internal_clear_suid_on_write(vp, cred, uio->uio_td); out: if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (err); } static daddr_t fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { const int biosize = fuse_iosize(vp); return (off / biosize); } static int fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *blksz) { off_t filesize; int err; const int biosize = fuse_iosize(vp); err = fuse_vnode_size(vp, &filesize, NULL, NULL); if (err) { /* This will turn into a SIGBUS */ return (EIO); } else if ((off_t)lbn * biosize >= filesize) { *blksz = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { *blksz = filesize - (off_t)lbn *biosize; } else { *blksz = biosize; } return (0); } /* struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; }; */ static int fuse_vnop_getpages(struct vop_getpages_args *ap) { struct vnode *vp = ap->a_vp; if (!fsess_opt_mmap(vnode_mount(vp))) { SDT_PROBE2(fusefs, , vnops, trace, 1, "called on non-cacheable vnode??\n"); return (VM_PAGER_ERROR); } return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz)); } static const char extattr_namespace_separator = '.'; /* struct vop_getextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getextattr(struct vop_getextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_getxattr_in *get_xattr_in; struct fuse_getxattr_out *get_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; char *attr_str; size_t len; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (fsess_not_impl(mp, FUSE_GETXATTR)) return EOPNOTSUPP; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*get_xattr_in)); fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, td, cred); get_xattr_in = fdi.indata; /* * Check to see whether we're querying the available size or * issuing the actual request. If we pass in 0, we get back struct * fuse_getxattr_out. If we pass in a non-zero size, we get back * that much data, without the struct fuse_getxattr_out header. */ if (uio == NULL) get_xattr_in->size = 0; else get_xattr_in->size = uio->uio_resid; attr_str = (char *)fdi.indata + sizeof(*get_xattr_in); snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_GETXATTR); err = EOPNOTSUPP; } goto out; } get_xattr_out = fdi.answ; if (ap->a_size != NULL) *ap->a_size = get_xattr_out->size; if (uio != NULL) err = uiomove(fdi.answ, fdi.iosize, uio); out: fdisp_destroy(&fdi); return (err); } /* struct vop_setextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setextattr(struct vop_setextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_setxattr_in *set_xattr_in; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (fsess_not_impl(mp, FUSE_SETXATTR)) return EOPNOTSUPP; if (vfs_isrdonly(mp)) return EROFS; /* Deleting xattrs must use VOP_DELETEEXTATTR instead */ if (ap->a_uio == NULL) { /* * If we got here as fallback from VOP_DELETEEXTATTR, then * return EOPNOTSUPP. */ if (fsess_not_impl(mp, FUSE_REMOVEXATTR)) return (EOPNOTSUPP); else return (EINVAL); } err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VWRITE); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*set_xattr_in) + uio->uio_resid); fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred); set_xattr_in = fdi.indata; set_xattr_in->size = uio->uio_resid; attr_str = (char *)fdi.indata + sizeof(*set_xattr_in); snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = uiomove((char *)fdi.indata + sizeof(*set_xattr_in) + len, uio->uio_resid, uio); if (err != 0) { goto out; } err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_SETXATTR); err = EOPNOTSUPP; } if (err == ERESTART) { /* Can't restart after calling uiomove */ err = EINTR; } out: fdisp_destroy(&fdi); return (err); } /* * The Linux / FUSE extended attribute list is simply a collection of * NUL-terminated strings. The FreeBSD extended attribute list is a single * byte length followed by a non-NUL terminated string. So, this allows * conversion of the Linux / FUSE format to the FreeBSD format in place. * Linux attribute names are reported with the namespace as a prefix (e.g. * "user.attribute_name"), but in FreeBSD they are reported without the * namespace prefix (e.g. "attribute_name"). So, we're going from: * * user.attr_name1\0user.attr_name2\0 * * to: * * attr_name1attr_name2 * * Where "" is a single byte number of characters in the attribute name. * * Args: * prefix - exattr namespace prefix string * list, list_len - input list with namespace prefixes * bsd_list, bsd_list_len - output list compatible with bsd vfs */ static int fuse_xattrlist_convert(char *prefix, const char *list, int list_len, char *bsd_list, int *bsd_list_len) { int len, pos, dist_to_next, prefix_len; pos = 0; *bsd_list_len = 0; prefix_len = strlen(prefix); while (pos < list_len && list[pos] != '\0') { dist_to_next = strlen(&list[pos]) + 1; if (bcmp(&list[pos], prefix, prefix_len) == 0 && list[pos + prefix_len] == extattr_namespace_separator) { len = dist_to_next - (prefix_len + sizeof(extattr_namespace_separator)) - 1; if (len >= EXTATTR_MAXNAMELEN) return (ENAMETOOLONG); bsd_list[*bsd_list_len] = len; memcpy(&bsd_list[*bsd_list_len + 1], &list[pos + prefix_len + sizeof(extattr_namespace_separator)], len); *bsd_list_len += len + 1; } pos += dist_to_next; } return (0); } /* * List extended attributes * * The FUSE_LISTXATTR operation is based on Linux's listxattr(2) syscall, which * has a number of differences compared to its FreeBSD equivalent, * extattr_list_file: * * - FUSE_LISTXATTR returns all extended attributes across all namespaces, * whereas listxattr(2) only returns attributes for a single namespace * - FUSE_LISTXATTR prepends each attribute name with "namespace." * - If the provided buffer is not large enough to hold the result, * FUSE_LISTXATTR should return ERANGE, whereas listxattr is expected to * return as many results as will fit. */ /* struct vop_listextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_listextattr(struct vop_listextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_listxattr_in *list_xattr_in; struct fuse_listxattr_out *list_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; char *bsd_list = NULL; char *linux_list; int bsd_list_len; int linux_list_len; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (fsess_not_impl(mp, FUSE_LISTXATTR)) return EOPNOTSUPP; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) return err; /* * Add space for a NUL and the period separator if enabled. * Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; fdisp_init(&fdi, sizeof(*list_xattr_in)); fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); /* * Retrieve Linux / FUSE compatible list size. */ list_xattr_in = fdi.indata; list_xattr_in->size = 0; err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LISTXATTR); err = EOPNOTSUPP; } goto out; } list_xattr_out = fdi.answ; linux_list_len = list_xattr_out->size; if (linux_list_len == 0) { if (ap->a_size != NULL) *ap->a_size = linux_list_len; goto out; } /* * Retrieve Linux / FUSE compatible list values. */ fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); list_xattr_in = fdi.indata; list_xattr_in->size = linux_list_len; err = fdisp_wait_answ(&fdi); if (err == ERANGE) { /* * Race detected. The attribute list must've grown since the * first FUSE_LISTXATTR call. Start over. Go all the way back * to userland so we can process signals, if necessary, before * restarting. */ err = ERESTART; goto out; } else if (err != 0) goto out; linux_list = fdi.answ; /* FUSE doesn't allow the server to return more data than requested */ if (fdi.iosize > linux_list_len) { struct fuse_data *data = fuse_get_mpdata(mp); fuse_warn(data, FSESS_WARN_LSEXTATTR_LONG, "server returned " "more extended attribute data than requested; " "should've returned ERANGE instead."); } else { /* But returning less data is fine */ linux_list_len = fdi.iosize; } /* * Retrieve the BSD compatible list values. * The Linux / FUSE attribute list format isn't the same * as FreeBSD's format. So we need to transform it into * FreeBSD's format before giving it to the user. */ bsd_list = malloc(linux_list_len, M_TEMP, M_WAITOK); err = fuse_xattrlist_convert(prefix, linux_list, linux_list_len, bsd_list, &bsd_list_len); if (err != 0) goto out; if (ap->a_size != NULL) *ap->a_size = bsd_list_len; if (uio != NULL) err = uiomove(bsd_list, bsd_list_len, uio); out: free(bsd_list, M_TEMP); fdisp_destroy(&fdi); return (err); } /* struct vop_deallocate_args { struct vop_generic_args a_gen; struct vnode *a_vp; off_t *a_offset; off_t *a_len; int a_flags; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_deallocate(struct vop_deallocate_args *ap) { struct vnode *vp = ap->a_vp; struct mount *mp = vnode_mount(vp); struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct fuse_fallocate_in *ffi; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; off_t *len = ap->a_len; off_t *offset = ap->a_offset; int ioflag = ap->a_ioflag; off_t filesize; int err; bool closefufh = false; if (fuse_isdeadfs(vp)) return (ENXIO); if (vfs_isrdonly(mp)) return (EROFS); if (fsess_not_impl(mp, FUSE_FALLOCATE)) goto fallback; err = fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { /* * nfsd will do I/O without first doing VOP_OPEN. We * must implicitly open the file here */ err = fuse_filehandle_open(vp, FWRITE, &fufh, curthread, cred); closefufh = true; } if (err) return (err); fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) goto out; fuse_inval_buf_range(vp, filesize, *offset, *offset + *len); fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FALLOCATE, vp, curthread, cred); ffi = fdi.indata; ffi->fh = fufh->fh_id; ffi->offset = *offset; ffi->length = *len; /* * FreeBSD's fspacectl is equivalent to Linux's fallocate with * mode == FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE */ ffi->mode = FUSE_FALLOC_FL_PUNCH_HOLE | FUSE_FALLOC_FL_KEEP_SIZE; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FALLOCATE); goto fallback; } else if (err == EOPNOTSUPP) { /* * The file system server does not support FUSE_FALLOCATE with * the supplied mode for this particular file. */ goto fallback; } else if (!err) { /* * Clip the returned offset to EoF. Do it here rather than * before FUSE_FALLOCATE just in case the kernel's cached file * size is out of date. Unfortunately, FUSE does not return * any information about filesize from that operation. */ *offset = MIN(*offset + *len, filesize); *len = 0; fuse_vnode_undirty_cached_timestamps(vp, false); fuse_internal_clear_suid_on_write(vp, cred, curthread); if (ioflag & IO_SYNC) err = fuse_internal_fsync(vp, curthread, MNT_WAIT, false); } out: if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (err); fallback: if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (vop_stddeallocate(ap)); } /* struct vop_deleteextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_dispatcher fdi; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (fsess_not_impl(mp, FUSE_REMOVEXATTR)) return EOPNOTSUPP; if (vfs_isrdonly(mp)) return EROFS; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VWRITE); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len); fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, td, cred); attr_str = fdi.indata; snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_REMOVEXATTR); err = EOPNOTSUPP; } fdisp_destroy(&fdi); return (err); } /* struct vnop_print_args { struct vnode *a_vp; }; */ static int fuse_vnop_print(struct vop_print_args *ap) { struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp); printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n", (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid, (uintmax_t)fvdat->nlookup, fvdat->flag); return 0; } /* * Get an NFS filehandle for a FUSE file. * * This will only work for FUSE file systems that guarantee the uniqueness of * nodeid:generation, which most don't. */ /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ static int fuse_vnop_vptofh(struct vop_vptofh_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp); _Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid), "FUSE fid type is too big"); struct mount *mp = vnode_mount(vp); struct fuse_data *data = fuse_get_mpdata(mp); struct vattr va; int err; if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) { /* NFS requires lookups for "." and ".." */ SDT_PROBE2(fusefs, , vnops, trace, 1, "VOP_VPTOFH without FUSE_EXPORT_SUPPORT"); return EOPNOTSUPP; } if ((mp->mnt_flag & MNT_EXPORTED) && !(data->dataflags & FSESS_NO_OPENDIR_SUPPORT)) { /* * NFS is stateless, so nfsd must reopen a directory on every * call to VOP_READDIR, passing in the d_off field from the * final dirent of the previous invocation. But without * FUSE_NO_OPENDIR_SUPPORT, the FUSE protocol does not * guarantee that d_off will be valid after a directory is * closed and reopened. So prohibit exporting FUSE file * systems that don't set that flag. * * But userspace NFS servers don't have this problem. */ SDT_PROBE2(fusefs, , vnops, trace, 1, "VOP_VPTOFH without FUSE_NO_OPENDIR_SUPPORT"); return EOPNOTSUPP; } err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread); if (err) return err; /*ip = VTOI(ap->a_vp);*/ /*ufhp = (struct ufid *)ap->a_fhp;*/ fhp->len = sizeof(struct fuse_fid); fhp->nid = fvdat->nid; if (fvdat->generation <= UINT32_MAX) fhp->gen = fvdat->generation; else return EOVERFLOW; return (0); } diff --git a/sys/fs/msdosfs/msdosfs_lookup.c b/sys/fs/msdosfs/msdosfs_lookup.c index 91b778b8173b..c061a9169f25 100644 --- a/sys/fs/msdosfs/msdosfs_lookup.c +++ b/sys/fs/msdosfs/msdosfs_lookup.c @@ -1,1100 +1,1096 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_lookup.c,v 1.37 1997/11/17 15:36:54 ws Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include static int msdosfs_lookup_checker(struct msdosfsmount *pmp, struct vnode *dvp, struct denode *tdp, struct vnode **vpp) { struct vnode *vp; vp = DETOV(tdp); /* * Lookup assumes that directory cannot be hardlinked. * Corrupted msdosfs filesystem could break this assumption. */ if (vp == dvp) { vput(vp); msdosfs_integrity_error(pmp); *vpp = NULL; return (EBADF); } *vpp = vp; return (0); } int msdosfs_lookup(struct vop_cachedlookup_args *ap) { return (msdosfs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL, NULL)); } struct deget_dotdot { u_long cluster; int blkoff; }; static int msdosfs_deget_dotdot(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { struct deget_dotdot *dd_arg; struct denode *rdp; struct msdosfsmount *pmp; int error; pmp = VFSTOMSDOSFS(mp); dd_arg = arg; error = deget(pmp, dd_arg->cluster, dd_arg->blkoff, LK_EXCLUSIVE, &rdp); if (error == 0) *rvp = DETOV(rdp); return (error); } /* * When we search a directory the blocks containing directory entries are * read and examined. The directory entries contain information that would * normally be in the inode of a unix filesystem. This means that some of * a directory's contents may also be in memory resident denodes (sort of * an inode). This can cause problems if we are searching while some other * process is modifying a directory. To prevent one process from accessing * incompletely modified directory information we depend upon being the * sole owner of a directory block. bread/brelse provide this service. * This being the case, when a process modifies a directory it must first * acquire the disk block that contains the directory entry to be modified. * Then update the disk block and the denode, and then write the disk block * out to disk. This way disk blocks containing directory entries and in * memory denode's will be in synch. */ int msdosfs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, daddr_t *scnp, u_long *blkoffp) { struct mbnambuf nb; daddr_t bn; int error; int slotcount; int slotoffset = 0; int frcn; u_long cluster; u_long blkoff; int diroff; int blsize; int isadir; /* ~0 if found direntry is a directory */ daddr_t scn; /* starting cluster number */ struct vnode *pdp; struct denode *dp; struct denode *tdp; struct msdosfsmount *pmp; struct buf *bp = NULL; struct direntry *dep = NULL; struct deget_dotdot dd_arg; u_char dosfilename[12]; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; int unlen; uint64_t inode1; int wincnt = 1; int chksum = -1, chksum_ok; int olddos = 1; #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): looking for %s\n", cnp->cn_nameptr); #endif dp = VTODE(vdp); pmp = dp->de_pmp; #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): vdp %p, dp %p, Attr %02x\n", vdp, dp, dp->de_Attributes); #endif restart: if (vpp != NULL) *vpp = NULL; /* * If they are going after the . or .. entry in the root directory, * they won't find it. DOS filesystems don't have them in the root * directory. So, we fake it. deget() is in on this scam too. */ if ((vdp->v_vflag & VV_ROOT) && cnp->cn_nameptr[0] == '.' && (cnp->cn_namelen == 1 || (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.'))) { isadir = ATTR_DIRECTORY; scn = MSDOSFSROOT; #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): looking for . or .. in root directory\n"); #endif cluster = MSDOSFSROOT; blkoff = MSDOSFSROOT_OFS; goto foundroot; } switch (unix2dosfn((const u_char *)cnp->cn_nameptr, dosfilename, cnp->cn_namelen, 0, pmp)) { case 0: return (EINVAL); case 1: break; case 2: wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr, cnp->cn_namelen, pmp) + 1; break; case 3: olddos = 0; wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr, cnp->cn_namelen, pmp) + 1; break; } if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) { wincnt = 1; olddos = 1; } unlen = winLenFixup(cnp->cn_nameptr, cnp->cn_namelen); /* * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. */ slotcount = wincnt; if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) slotcount = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): dos version of filename %s, length %ld\n", dosfilename, cnp->cn_namelen); #endif /* * Search the directory pointed at by vdp for the name pointed at * by cnp->cn_nameptr. */ tdp = NULL; mbnambuf_init(&nb); /* * The outer loop ranges over the clusters that make up the * directory. Note that the root directory is different from all * other directories. It has a fixed number of blocks that are not * part of the pool of allocatable clusters. So, we treat it a * little differently. The root directory starts at "cluster" 0. */ diroff = 0; for (frcn = 0;; frcn++) { error = pcbmap(dp, frcn, &bn, &cluster, &blsize); if (error) { if (error == E2BIG) break; return (error); } error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { return (error); } for (blkoff = 0; blkoff < blsize; blkoff += sizeof(struct direntry), diroff += sizeof(struct direntry)) { dep = (struct direntry *)(bp->b_data + blkoff); /* * If the slot is empty and we are still looking * for an empty then remember this one. If the * slot is not empty then check to see if it * matches what we are looking for. If the slot * has never been filled with anything, then the * remainder of the directory has never been used, * so there is no point in searching it. */ if (dep->deName[0] == SLOT_EMPTY || dep->deName[0] == SLOT_DELETED) { /* * Drop memory of previous long matches */ chksum = -1; mbnambuf_init(&nb); if (slotcount < wincnt) { slotcount++; slotoffset = diroff; } if (dep->deName[0] == SLOT_EMPTY) { brelse(bp); goto notfound; } } else { /* * If there wasn't enough space for our winentries, * forget about the empty space */ if (slotcount < wincnt) slotcount = 0; /* * Check for Win95 long filename entry */ if (dep->deAttributes == ATTR_WIN95) { if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) continue; chksum = win2unixfn(&nb, (struct winentry *)dep, chksum, pmp); continue; } chksum = winChkName(&nb, (const u_char *)cnp->cn_nameptr, unlen, chksum, pmp); if (chksum == -2) { chksum = -1; continue; } /* * Ignore volume labels (anywhere, not just * the root directory). */ if (dep->deAttributes & ATTR_VOLUME) { chksum = -1; continue; } /* * Check for a checksum or name match */ chksum_ok = (chksum == winChksum(dep->deName)); if (!chksum_ok && (!olddos || bcmp(dosfilename, dep->deName, 11))) { chksum = -1; continue; } #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): match blkoff %d, diroff %d\n", blkoff, diroff); #endif /* * Remember where this directory * entry came from for whoever did * this lookup. */ dp->de_fndoffset = diroff; if (chksum_ok && nameiop == RENAME) { /* * Target had correct long name * directory entries, reuse them * as needed. */ dp->de_fndcnt = wincnt - 1; } else { /* * Long name directory entries * not present or corrupt, can only * reuse dos directory entry. */ dp->de_fndcnt = 0; } goto found; } } /* for (blkoff = 0; .... */ /* * Release the buffer holding the directory cluster just * searched. */ brelse(bp); } /* for (frcn = 0; ; frcn++) */ notfound: /* * We hold no disk buffers at this point. */ /* * Fixup the slot description to point to the place where * we might put the new DOS direntry (putting the Win95 * long name entries before that) */ if (!slotcount) { slotcount = 1; slotoffset = diroff; } if (wincnt > slotcount) slotoffset += sizeof(struct direntry) * (wincnt - slotcount); /* * If we get here we didn't find the entry we were looking for. But * that's ok if we are creating or renaming and are at the end of * the pathname and the directory hasn't been removed. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_lookup(): op %d, refcnt %ld\n", nameiop, dp->de_refcnt); printf(" slotcount %d, slotoffset %d\n", slotcount, slotoffset); #endif if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN) && dp->de_refcnt != 0) { /* * Access for write is interpreted as allowing * creation of files in the directory. */ error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, curthread); if (error) return (error); /* * Return an indication of where the new directory * entry should be put. */ dp->de_fndoffset = slotoffset; dp->de_fndcnt = wincnt - 1; /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. - * The pathname buffer is saved so that the name - * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ - cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } #if 0 /* * Insert name into cache (as non-existent) if appropriate. * * XXX Negative caching is broken for msdosfs because the name * cache doesn't understand peculiarities such as case insensitivity * and 8.3 filenames. Hence, it may not invalidate all negative * entries if a file with this name is later created. */ if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(vdp, *vpp, cnp); #endif return (ENOENT); found: /* * NOTE: We still have the buffer with matched directory entry at * this point. */ isadir = dep->deAttributes & ATTR_DIRECTORY; scn = getushort(dep->deStartCluster); if (FAT32(pmp)) { scn |= getushort(dep->deHighClust) << 16; if (scn == pmp->pm_rootdirblk) { /* * There should actually be 0 here. * Just ignore the error. */ scn = MSDOSFSROOT; } } if (isadir) { cluster = scn; if (cluster == MSDOSFSROOT) blkoff = MSDOSFSROOT_OFS; else blkoff = 0; } else if (cluster == MSDOSFSROOT) blkoff = diroff; /* * Now release buf to allow deget to read the entry again. * Reserving it here and giving it to deget could result * in a deadlock. */ brelse(bp); bp = NULL; foundroot: /* * If we entered at foundroot, then we are looking for the . or .. * entry of the filesystems root directory. isadir and scn were * setup before jumping here. And, bp is already null. */ if (FAT32(pmp) && scn == MSDOSFSROOT) scn = pmp->pm_rootdirblk; if (scnp != NULL) { *scnp = cluster; *blkoffp = blkoff; return (0); } /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. */ if (nameiop == DELETE && (flags & ISLASTCN)) { /* * Don't allow deleting the root. */ if (blkoff == MSDOSFSROOT_OFS) return (EBUSY); /* * Write access to directory required to delete files. */ error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, curthread); if (error) return (error); /* * Return pointer to current entry in dp->i_offset. * Save directory inode pointer in ndp->ni_dvp for dirremove(). */ if (dp->de_StartCluster == scn && isadir) { /* "." */ VREF(vdp); *vpp = vdp; return (0); } error = deget(pmp, cluster, blkoff, LK_EXCLUSIVE, &tdp); if (error) return (error); return (msdosfs_lookup_checker(pmp, vdp, tdp, vpp)); } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && (flags & ISLASTCN)) { if (blkoff == MSDOSFSROOT_OFS) return (EBUSY); error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, curthread); if (error) return (error); /* * Careful about locking second inode. * This can only occur if the target is ".". */ if (dp->de_StartCluster == scn && isadir) return (EISDIR); if ((error = deget(pmp, cluster, blkoff, LK_EXCLUSIVE, &tdp)) != 0) return (error); if ((error = msdosfs_lookup_checker(pmp, vdp, tdp, vpp)) != 0) return (error); - cnp->cn_flags |= SAVENAME; return (0); } /* * Step through the translation in the name. We do not `vput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. */ pdp = vdp; if (flags & ISDOTDOT) { dd_arg.cluster = cluster; dd_arg.blkoff = blkoff; error = vn_vget_ino_gen(vdp, msdosfs_deget_dotdot, &dd_arg, cnp->cn_lkflags, vpp); if (error != 0) { *vpp = NULL; return (error); } /* * Recheck that ".." still points to the inode we * looked up before pdp lock was dropped. */ error = msdosfs_lookup_ino(pdp, NULL, cnp, &scn, &blkoff); if (error) { vput(*vpp); *vpp = NULL; return (error); } if (FAT32(pmp) && scn == MSDOSFSROOT) scn = pmp->pm_rootdirblk; inode1 = scn * pmp->pm_bpcluster + blkoff; if (VTODE(*vpp)->de_inode != inode1) { vput(*vpp); goto restart; } error = msdosfs_lookup_checker(pmp, vdp, VTODE(*vpp), vpp); if (error != 0) return (error); } else if (dp->de_StartCluster == scn && isadir) { if (cnp->cn_namelen != 1 || cnp->cn_nameptr[0] != '.') { /* fs is corrupted, non-dot lookup returned dvp */ msdosfs_integrity_error(pmp); return (EBADF); } VREF(vdp); /* we want ourself, ie "." */ *vpp = vdp; } else { if ((error = deget(pmp, cluster, blkoff, LK_EXCLUSIVE, &tdp)) != 0) return (error); if ((error = msdosfs_lookup_checker(pmp, vdp, tdp, vpp)) != 0) return (error); } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } /* * dep - directory entry to copy into the directory * ddep - directory to add to * depp - return the address of the denode for the created directory entry * if depp != 0 * cnp - componentname needed for Win95 long filenames */ int createde(struct denode *dep, struct denode *ddep, struct denode **depp, struct componentname *cnp) { int error; u_long dirclust, diroffset; struct direntry *ndep; struct msdosfsmount *pmp = ddep->de_pmp; struct buf *bp; daddr_t bn; int blsize; #ifdef MSDOSFS_DEBUG printf("createde(dep %p, ddep %p, depp %p, cnp %p)\n", dep, ddep, depp, cnp); #endif /* * If no space left in the directory then allocate another cluster * and chain it onto the end of the file. There is one exception * to this. That is, if the root directory has no more space it * can NOT be expanded. extendfile() checks for and fails attempts * to extend the root directory. We just return an error in that * case. */ if (ddep->de_fndoffset >= ddep->de_FileSize) { diroffset = ddep->de_fndoffset + sizeof(struct direntry) - ddep->de_FileSize; dirclust = de_clcount(pmp, diroffset); error = extendfile(ddep, dirclust, 0, 0, DE_CLEAR); if (error) { (void)detrunc(ddep, ddep->de_FileSize, 0, NOCRED); return error; } /* * Update the size of the directory */ ddep->de_FileSize += de_cn2off(pmp, dirclust); } /* * We just read in the cluster with space. Copy the new directory * entry in. Then write it to disk. NOTE: DOS directories * do not get smaller as clusters are emptied. */ error = pcbmap(ddep, de_cluster(pmp, ddep->de_fndoffset), &bn, &dirclust, &blsize); if (error) return error; diroffset = ddep->de_fndoffset; if (dirclust != MSDOSFSROOT) diroffset &= pmp->pm_crbomask; if ((error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp)) != 0) { brelse(bp); return error; } ndep = bptoep(pmp, bp, ddep->de_fndoffset); DE_EXTERNALIZE(ndep, dep); /* * Now write the Win95 long name */ if (ddep->de_fndcnt > 0) { uint8_t chksum = winChksum(ndep->deName); const u_char *un = (const u_char *)cnp->cn_nameptr; int unlen = cnp->cn_namelen; int cnt = 1; while (--ddep->de_fndcnt >= 0) { if (!(ddep->de_fndoffset & pmp->pm_crbomask)) { if (DOINGASYNC(DETOV(ddep))) bdwrite(bp); else if ((error = bwrite(bp)) != 0) return error; ddep->de_fndoffset -= sizeof(struct direntry); error = pcbmap(ddep, de_cluster(pmp, ddep->de_fndoffset), &bn, 0, &blsize); if (error) return error; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { return error; } ndep = bptoep(pmp, bp, ddep->de_fndoffset); } else { ndep--; ddep->de_fndoffset -= sizeof(struct direntry); } if (!unix2winfn(un, unlen, (struct winentry *)ndep, cnt++, chksum, pmp)) break; } } if (DOINGASYNC(DETOV(ddep))) bdwrite(bp); else if ((error = bwrite(bp)) != 0) return error; /* * If they want us to return with the denode gotten. */ if (depp) { if (dep->de_Attributes & ATTR_DIRECTORY) { dirclust = dep->de_StartCluster; if (FAT32(pmp) && dirclust == pmp->pm_rootdirblk) dirclust = MSDOSFSROOT; if (dirclust == MSDOSFSROOT) diroffset = MSDOSFSROOT_OFS; else diroffset = 0; } return (deget(pmp, dirclust, diroffset, LK_EXCLUSIVE, depp)); } return 0; } /* * Be sure a directory is empty except for "." and "..". Return 1 if empty, * return 0 if not empty or error. */ int dosdirempty(struct denode *dep) { int blsize; int error; u_long cn; daddr_t bn; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; /* * Since the filesize field in directory entries for a directory is * zero, we just have to feel our way through the directory until * we hit end of file. */ for (cn = 0;; cn++) { if ((error = pcbmap(dep, cn, &bn, 0, &blsize)) != 0) { if (error == E2BIG) return (1); /* it's empty */ return (0); } error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { return (0); } for (dentp = (struct direntry *)bp->b_data; (char *)dentp < bp->b_data + blsize; dentp++) { if (dentp->deName[0] != SLOT_DELETED && (dentp->deAttributes & ATTR_VOLUME) == 0) { /* * In dos directories an entry whose name * starts with SLOT_EMPTY (0) starts the * beginning of the unused part of the * directory, so we can just return that it * is empty. */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); return (1); } /* * Any names other than "." and ".." in a * directory mean it is not empty. */ if (bcmp(dentp->deName, ". ", 11) && bcmp(dentp->deName, ".. ", 11)) { brelse(bp); #ifdef MSDOSFS_DEBUG printf("dosdirempty(): entry found %02x, %02x\n", dentp->deName[0], dentp->deName[1]); #endif return (0); /* not empty */ } } } brelse(bp); } /* NOTREACHED */ } /* * Check to see if the directory described by target is in some * subdirectory of source. This prevents something like the following from * succeeding and leaving a bunch or files and directories orphaned. mv * /a/b/c /a/b/c/d/e/f Where c and f are directories. * * source - the inode for /a/b/c * target - the inode for /a/b/c/d/e/f * * Returns 0 if target is NOT a subdirectory of source. * Otherwise returns a non-zero error number. */ int doscheckpath(struct denode *source, struct denode *target, daddr_t *wait_scn) { daddr_t scn; struct msdosfsmount *pmp; struct direntry *ep; struct denode *dep; struct buf *bp = NULL; int error = 0; *wait_scn = 0; pmp = target->de_pmp; lockmgr_assert(&pmp->pm_checkpath_lock, KA_XLOCKED); KASSERT(pmp == source->de_pmp, ("doscheckpath: source and target on different filesystems")); if ((target->de_Attributes & ATTR_DIRECTORY) == 0 || (source->de_Attributes & ATTR_DIRECTORY) == 0) return (ENOTDIR); if (target->de_StartCluster == source->de_StartCluster) return (EEXIST); if (target->de_StartCluster == MSDOSFSROOT || (FAT32(pmp) && target->de_StartCluster == pmp->pm_rootdirblk)) return (0); dep = target; vget(DETOV(dep), LK_EXCLUSIVE); for (;;) { if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) { error = ENOTDIR; break; } scn = dep->de_StartCluster; error = bread(pmp->pm_devvp, cntobn(pmp, scn), pmp->pm_bpcluster, NOCRED, &bp); if (error != 0) break; ep = (struct direntry *)bp->b_data + 1; if ((ep->deAttributes & ATTR_DIRECTORY) == 0 || bcmp(ep->deName, ".. ", 11) != 0) { error = ENOTDIR; brelse(bp); break; } scn = getushort(ep->deStartCluster); if (FAT32(pmp)) scn |= getushort(ep->deHighClust) << 16; brelse(bp); if (scn == source->de_StartCluster) { error = EINVAL; break; } if (scn == MSDOSFSROOT) break; if (FAT32(pmp) && scn == pmp->pm_rootdirblk) { /* * scn should be 0 in this case, * but we silently ignore the error. */ break; } vput(DETOV(dep)); dep = NULL; /* NOTE: deget() clears dep on error */ error = deget(pmp, scn, 0, LK_EXCLUSIVE | LK_NOWAIT, &dep); if (error != 0) { *wait_scn = scn; break; } } #ifdef MSDOSFS_DEBUG if (error == ENOTDIR) printf("doscheckpath(): .. not a directory?\n"); #endif if (dep != NULL) vput(DETOV(dep)); return (error); } /* * Read in the disk block containing the directory entry (dirclu, dirofs) * and return the address of the buf header, and the address of the * directory entry within the block. */ int readep(struct msdosfsmount *pmp, u_long dirclust, u_long diroffset, struct buf **bpp, struct direntry **epp) { int error; daddr_t bn; int blsize; blsize = pmp->pm_bpcluster; if (dirclust == MSDOSFSROOT && de_blk(pmp, diroffset + blsize) > pmp->pm_rootdirsize) blsize = de_bn2off(pmp, pmp->pm_rootdirsize) & pmp->pm_crbomask; bn = detobn(pmp, dirclust, diroffset); if ((error = bread(pmp->pm_devvp, bn, blsize, NOCRED, bpp)) != 0) { brelse(*bpp); *bpp = NULL; return (error); } if (epp) *epp = bptoep(pmp, *bpp, diroffset); return (0); } /* * Read in the disk block containing the directory entry dep came from and * return the address of the buf header, and the address of the directory * entry within the block. */ int readde(struct denode *dep, struct buf **bpp, struct direntry **epp) { return (readep(dep->de_pmp, dep->de_dirclust, dep->de_diroffset, bpp, epp)); } /* * Remove a directory entry. At this point the file represented by the * directory entry to be removed is still full length until no one has it * open. When the file no longer being used msdosfs_inactive() is called * and will truncate the file to 0 length. When the vnode containing the * denode is needed for some other purpose by VFS it will call * msdosfs_reclaim() which will remove the denode from the denode cache. * * pdep directory where the entry is removed * dep file to be removed */ int removede(struct denode *pdep, struct denode *dep) { int error; struct direntry *ep; struct buf *bp; daddr_t bn; int blsize; struct msdosfsmount *pmp = pdep->de_pmp; u_long offset = pdep->de_fndoffset; #ifdef MSDOSFS_DEBUG printf("removede(): filename %s, dep %p, offset %08lx\n", dep->de_Name, dep, offset); #endif dep->de_refcnt--; offset += sizeof(struct direntry); do { offset -= sizeof(struct direntry); error = pcbmap(pdep, de_cluster(pmp, offset), &bn, 0, &blsize); if (error) return error; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { return error; } ep = bptoep(pmp, bp, offset); /* * Check whether, if we came here the second time, i.e. * when underflowing into the previous block, the last * entry in this block is a longfilename entry, too. */ if (ep->deAttributes != ATTR_WIN95 && offset != pdep->de_fndoffset) { brelse(bp); break; } offset += sizeof(struct direntry); while (1) { /* * We are a bit aggressive here in that we delete any Win95 * entries preceding this entry, not just the ones we "own". * Since these presumably aren't valid anyway, * there should be no harm. */ offset -= sizeof(struct direntry); ep--->deName[0] = SLOT_DELETED; if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) || !(offset & pmp->pm_crbomask) || ep->deAttributes != ATTR_WIN95) break; } if (DOINGASYNC(DETOV(pdep))) bdwrite(bp); else if ((error = bwrite(bp)) != 0) return error; } while (!(pmp->pm_flags & MSDOSFSMNT_NOWIN95) && !(offset & pmp->pm_crbomask) && offset); return 0; } /* * Create a unique DOS name in dvp */ int uniqdosname(struct denode *dep, struct componentname *cnp, u_char *cp) { struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; int gen; int blsize; u_long cn; daddr_t bn; struct buf *bp; int error; if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) return (unix2dosfn((const u_char *)cnp->cn_nameptr, cp, cnp->cn_namelen, 0, pmp) ? 0 : EINVAL); for (gen = 1;; gen++) { /* * Generate DOS name with generation number */ if (!unix2dosfn((const u_char *)cnp->cn_nameptr, cp, cnp->cn_namelen, gen, pmp)) return gen == 1 ? EINVAL : EEXIST; /* * Now look for a dir entry with this exact name */ for (cn = error = 0; !error; cn++) { if ((error = pcbmap(dep, cn, &bn, 0, &blsize)) != 0) { if (error == E2BIG) /* EOF reached and not found */ return 0; return error; } error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { return error; } for (dentp = (struct direntry *)bp->b_data; (char *)dentp < bp->b_data + blsize; dentp++) { if (dentp->deName[0] == SLOT_EMPTY) { /* * Last used entry and not found */ brelse(bp); return 0; } /* * Ignore volume labels and Win95 entries */ if (dentp->deAttributes & ATTR_VOLUME) continue; if (!bcmp(dentp->deName, cp, 11)) { error = EEXIST; break; } } brelse(bp); } } } diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c index 36da35257d2c..f5a07cd5f492 100644 --- a/sys/fs/msdosfs/msdosfs_vnops.c +++ b/sys/fs/msdosfs/msdosfs_vnops.c @@ -1,2005 +1,1992 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Prototypes for MSDOSFS vnode operations */ static vop_create_t msdosfs_create; static vop_mknod_t msdosfs_mknod; static vop_open_t msdosfs_open; static vop_close_t msdosfs_close; static vop_access_t msdosfs_access; static vop_getattr_t msdosfs_getattr; static vop_setattr_t msdosfs_setattr; static vop_read_t msdosfs_read; static vop_write_t msdosfs_write; static vop_fsync_t msdosfs_fsync; static vop_remove_t msdosfs_remove; static vop_link_t msdosfs_link; static vop_rename_t msdosfs_rename; static vop_mkdir_t msdosfs_mkdir; static vop_rmdir_t msdosfs_rmdir; static vop_symlink_t msdosfs_symlink; static vop_readdir_t msdosfs_readdir; static vop_bmap_t msdosfs_bmap; static vop_getpages_t msdosfs_getpages; static vop_strategy_t msdosfs_strategy; static vop_print_t msdosfs_print; static vop_pathconf_t msdosfs_pathconf; static vop_vptofh_t msdosfs_vptofh; /* * Some general notes: * * In the ufs filesystem the inodes, superblocks, and indirect blocks are * read/written using the vnode for the filesystem. Blocks that represent * the contents of a file are read/written using the vnode for the file * (including directories when they are read/written as files). This * presents problems for the dos filesystem because data that should be in * an inode (if dos had them) resides in the directory itself. Since we * must update directory entries without the benefit of having the vnode * for the directory we must use the vnode for the filesystem. This means * that when a directory is actually read/written (via read, write, or * readdir, or seek) we must use the vnode for the filesystem instead of * the vnode for the directory as would happen in ufs. This is to insure we * retrieve the correct block from the buffer cache since the hash value is * based upon the vnode address and the desired block number. */ /* * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or * only if the SAVESTART bit in cn_flags is clear on success. */ static int msdosfs_create(struct vop_create_args *ap) { struct componentname *cnp = ap->a_cnp; struct denode ndirent; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct timespec ts; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_create(cnp %p, vap %p\n", cnp, ap->a_vap); #endif /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad; } /* * Create a directory entry for the file, then call createde() to * have it installed. NOTE: DOS files are always executable. We * use the absence of the owner write bit to make the file * readonly. */ -#ifdef DIAGNOSTIC - if ((cnp->cn_flags & HASBUF) == 0) - panic("msdosfs_create: no name"); -#endif memset(&ndirent, 0, sizeof(ndirent)); error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = ATTR_ARCHIVE; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = 0; ndirent.de_FileSize = 0; ndirent.de_pmp = pdep->de_pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; vfs_timestamp(&ts); DETIMES(&ndirent, &ts, &ts, &ts); error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; *ap->a_vpp = DETOV(dep); if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (0); bad: return (error); } static int msdosfs_mknod(struct vop_mknod_args *ap) { return (EINVAL); } static int msdosfs_open(struct vop_open_args *ap) { struct denode *dep = VTODE(ap->a_vp); vnode_create_vobject(ap->a_vp, dep->de_FileSize, ap->a_td); return 0; } static int msdosfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct timespec ts; VI_LOCK(vp); if (vp->v_usecount > 1) { vfs_timestamp(&ts); DETIMES(dep, &ts, &ts, &ts); } VI_UNLOCK(vp); return 0; } static int msdosfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; mode_t file_mode; accmode_t accmode = ap->a_accmode; file_mode = S_IRWXU|S_IRWXG|S_IRWXO; file_mode &= (vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask); /* * Disallow writing to directories and regular files if the * filesystem is read-only. */ if (accmode & VWRITE) { switch (vp->v_type) { case VREG: case VDIR: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } return (vaccess(vp->v_type, file_mode, pmp->pm_uid, pmp->pm_gid, ap->a_accmode, ap->a_cred)); } static int msdosfs_getattr(struct vop_getattr_args *ap) { struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; mode_t mode; struct timespec ts; u_long dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); uint64_t fileid; vfs_timestamp(&ts); DETIMES(dep, &ts, &ts, &ts); vap->va_fsid = dev2udev(pmp->pm_dev); /* * The following computation of the fileid must be the same as that * used in msdosfs_readdir() to compute d_fileno. If not, pwd * doesn't work. */ if (dep->de_Attributes & ATTR_DIRECTORY) { fileid = (uint64_t)cntobn(pmp, dep->de_StartCluster) * dirsperblk; if (dep->de_StartCluster == MSDOSFSROOT) fileid = 1; } else { fileid = (uint64_t)cntobn(pmp, dep->de_dirclust) * dirsperblk; if (dep->de_dirclust == MSDOSFSROOT) fileid = (uint64_t)roottobn(pmp, 0) * dirsperblk; fileid += (uoff_t)dep->de_diroffset / sizeof(struct direntry); } vap->va_fileid = fileid; mode = S_IRWXU|S_IRWXG|S_IRWXO; if (dep->de_Attributes & ATTR_READONLY) mode &= ~(S_IWUSR|S_IWGRP|S_IWOTH); vap->va_mode = mode & (ap->a_vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask); vap->va_uid = pmp->pm_uid; vap->va_gid = pmp->pm_gid; vap->va_nlink = 1; vap->va_rdev = NODEV; vap->va_size = dep->de_FileSize; fattime2timespec(dep->de_MDate, dep->de_MTime, 0, 0, &vap->va_mtime); vap->va_ctime = vap->va_mtime; if (pmp->pm_flags & MSDOSFSMNT_LONGNAME) { fattime2timespec(dep->de_ADate, 0, 0, 0, &vap->va_atime); fattime2timespec(dep->de_CDate, dep->de_CTime, dep->de_CHun, 0, &vap->va_birthtime); } else { vap->va_atime = vap->va_mtime; vap->va_birthtime.tv_sec = -1; vap->va_birthtime.tv_nsec = 0; } vap->va_flags = 0; if (dep->de_Attributes & ATTR_ARCHIVE) vap->va_flags |= UF_ARCHIVE; if (dep->de_Attributes & ATTR_HIDDEN) vap->va_flags |= UF_HIDDEN; if (dep->de_Attributes & ATTR_READONLY) vap->va_flags |= UF_READONLY; if (dep->de_Attributes & ATTR_SYSTEM) vap->va_flags |= UF_SYSTEM; vap->va_gen = 0; vap->va_blocksize = pmp->pm_bpcluster; vap->va_bytes = (dep->de_FileSize + pmp->pm_crbomask) & ~pmp->pm_crbomask; vap->va_type = ap->a_vp->v_type; vap->va_filerev = dep->de_modrev; return (0); } static int msdosfs_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): vp %p, vap %p, cred %p\n", ap->a_vp, vap, cred); #endif /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): returning EINVAL\n"); printf(" va_type %d, va_nlink %llx, va_fsid %llx, va_fileid %llx\n", vap->va_type, (unsigned long long)vap->va_nlink, (unsigned long long)vap->va_fsid, (unsigned long long)vap->va_fileid); printf(" va_blocksize %lx, va_rdev %llx, va_bytes %llx, va_gen %lx\n", vap->va_blocksize, (unsigned long long)vap->va_rdev, (unsigned long long)vap->va_bytes, vap->va_gen); printf(" va_uid %x, va_gid %x\n", vap->va_uid, vap->va_gid); #endif return (EINVAL); } /* * We don't allow setting attributes on the root directory. * The special case for the root directory is because before * FAT32, the root directory didn't have an entry for itself * (and was otherwise special). With FAT32, the root * directory is not so special, but still doesn't have an * entry for itself. */ if (vp->v_vflag & VV_ROOT) return (EINVAL); if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN); if (error) return (error); } /* * We are very inconsistent about handling unsupported * attributes. We ignored the access time and the * read and execute bits. We were strict for the other * attributes. */ if (vap->va_flags & ~(UF_ARCHIVE | UF_HIDDEN | UF_READONLY | UF_SYSTEM)) return EOPNOTSUPP; if (vap->va_flags & UF_ARCHIVE) dep->de_Attributes |= ATTR_ARCHIVE; else dep->de_Attributes &= ~ATTR_ARCHIVE; if (vap->va_flags & UF_HIDDEN) dep->de_Attributes |= ATTR_HIDDEN; else dep->de_Attributes &= ~ATTR_HIDDEN; /* We don't allow changing the readonly bit on directories. */ if (vp->v_type != VDIR) { if (vap->va_flags & UF_READONLY) dep->de_Attributes |= ATTR_READONLY; else dep->de_Attributes &= ~ATTR_READONLY; } if (vap->va_flags & UF_SYSTEM) dep->de_Attributes |= ATTR_SYSTEM; else dep->de_Attributes &= ~ATTR_SYSTEM; dep->de_flag |= DE_MODIFIED; } if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { uid_t uid; gid_t gid; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); uid = vap->va_uid; if (uid == (uid_t)VNOVAL) uid = pmp->pm_uid; gid = vap->va_gid; if (gid == (gid_t)VNOVAL) gid = pmp->pm_gid; if (cred->cr_uid != pmp->pm_uid || uid != pmp->pm_uid || (gid != pmp->pm_gid && !groupmember(gid, cred))) { error = priv_check_cred(cred, PRIV_VFS_CHOWN); if (error) return (error); } if (uid != pmp->pm_uid || gid != pmp->pm_gid) return EINVAL; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VREG: /* * Truncation is only supported for regular files, * Disallow it if the filesystem is read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support any file types except regular * files and directories in this file system, so * this (default) case is unreachable and can do * anything. Keep falling through to detrunc() * for now. */ break; } error = detrunc(dep, vap->va_size, 0, cred); if (error) return error; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = vn_utimes_perm(vp, vap, cred, td); if (error != 0) return (error); if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) == 0 && vap->va_atime.tv_sec != VNOVAL) { dep->de_flag &= ~DE_ACCESS; timespec2fattime(&vap->va_atime, 0, &dep->de_ADate, NULL, NULL); } if (vap->va_mtime.tv_sec != VNOVAL) { dep->de_flag &= ~DE_UPDATE; timespec2fattime(&vap->va_mtime, 0, &dep->de_MDate, &dep->de_MTime, NULL); } /* * We don't set the archive bit when modifying the time of * a directory to emulate the Windows/DOS behavior. */ if (vp->v_type != VDIR) dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } /* * DOS files only have the ability to have their writability * attribute set, so we use the owner write bit to set the readonly * attribute. */ if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN); if (error) return (error); } if (vp->v_type != VDIR) { /* We ignore the read and execute bits. */ if (vap->va_mode & S_IWUSR) dep->de_Attributes &= ~ATTR_READONLY; else dep->de_Attributes |= ATTR_READONLY; dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } } return (deupdat(dep, 0)); } static int msdosfs_read(struct vop_read_args *ap) { int error = 0; int blsize; int isadir; ssize_t orig_resid; u_int n; u_long diff; u_long on; daddr_t lbn; daddr_t rablock; int rasize; int seqcount; struct buf *bp; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct uio *uio = ap->a_uio; /* * If they didn't ask for any data, then we are done. */ orig_resid = uio->uio_resid; if (orig_resid == 0) return (0); /* * The caller is supposed to ensure that * uio->uio_offset >= 0 and uio->uio_resid >= 0. * We don't need to check for large offsets as in ffs because * dep->de_FileSize <= MSDOSFS_FILESIZE_MAX < OFF_MAX, so large * offsets cannot cause overflow even in theory. */ seqcount = ap->a_ioflag >> IO_SEQSHIFT; isadir = dep->de_Attributes & ATTR_DIRECTORY; do { if (uio->uio_offset >= dep->de_FileSize) break; lbn = de_cluster(pmp, uio->uio_offset); rablock = lbn + 1; blsize = pmp->pm_bpcluster; on = uio->uio_offset & pmp->pm_crbomask; /* * If we are operating on a directory file then be sure to * do i/o with the vnode for the filesystem instead of the * vnode for the directory. */ if (isadir) { /* convert cluster # to block # */ error = pcbmap(dep, lbn, &lbn, 0, &blsize); if (error == E2BIG) { error = EINVAL; break; } else if (error) break; error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); } else if (de_cn2off(pmp, rablock) >= dep->de_FileSize) { error = bread(vp, lbn, blsize, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, dep->de_FileSize, lbn, blsize, NOCRED, on + uio->uio_resid, seqcount, 0, &bp); } else if (seqcount > 1) { rasize = blsize; error = breadn(vp, lbn, blsize, &rablock, &rasize, 1, NOCRED, &bp); } else { error = bread(vp, lbn, blsize, NOCRED, &bp); } if (error) { brelse(bp); break; } diff = pmp->pm_bpcluster - on; n = diff > uio->uio_resid ? uio->uio_resid : diff; diff = dep->de_FileSize - uio->uio_offset; if (diff < n) n = diff; diff = blsize - bp->b_resid; if (diff < n) n = diff; error = vn_io_fault_uiomove(bp->b_data + on, (int) n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); if (!isadir && (error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) dep->de_flag |= DE_ACCESS; return (error); } /* * Write data to a file or directory. */ static int msdosfs_write(struct vop_write_args *ap) { int n; int croffset; ssize_t resid; u_long osize; int error = 0; u_long count; int seqcount; daddr_t bn, lastcn; struct buf *bp; int ioflag = ap->a_ioflag; struct uio *uio = ap->a_uio; struct vnode *vp = ap->a_vp; struct vnode *thisvp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; #ifdef MSDOSFS_DEBUG printf("msdosfs_write(vp %p, uio %p, ioflag %x, cred %p\n", vp, uio, ioflag, cred); printf("msdosfs_write(): diroff %lu, dirclust %lu, startcluster %lu\n", dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = dep->de_FileSize; thisvp = vp; break; case VDIR: return EISDIR; default: panic("msdosfs_write(): bad file type"); } /* * This is needed (unlike in ffs_write()) because we extend the * file outside of the loop but we don't want to extend the file * for writes of 0 bytes. */ if (uio->uio_resid == 0) return (0); /* * The caller is supposed to ensure that * uio->uio_offset >= 0 and uio->uio_resid >= 0. */ if ((uoff_t)uio->uio_offset + uio->uio_resid > MSDOSFS_FILESIZE_MAX) return (EFBIG); /* * If they've exceeded their filesize limit, tell them about it. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); /* * If the offset we are starting the write at is beyond the end of * the file, then they've done a seek. Unix filesystems allow * files with holes in them, DOS doesn't so we must fill the hole * with zeroed blocks. */ if (uio->uio_offset > dep->de_FileSize) { error = deextend(dep, uio->uio_offset, cred); if (error) return (error); } /* * Remember some values in case the write fails. */ resid = uio->uio_resid; osize = dep->de_FileSize; /* * If we write beyond the end of the file, extend it to its ultimate * size ahead of the time to hopefully get a contiguous area. */ if (uio->uio_offset + resid > osize) { count = de_clcount(pmp, uio->uio_offset + resid) - de_clcount(pmp, osize); error = extendfile(dep, count, NULL, NULL, 0); if (error && (error != ENOSPC || (ioflag & IO_UNIT))) goto errexit; lastcn = dep->de_fc[FC_LASTFC].fc_frcn; } else lastcn = de_clcount(pmp, osize) - 1; seqcount = ioflag >> IO_SEQSHIFT; do { if (de_cluster(pmp, uio->uio_offset) > lastcn) { error = ENOSPC; break; } croffset = uio->uio_offset & pmp->pm_crbomask; n = min(uio->uio_resid, pmp->pm_bpcluster - croffset); if (uio->uio_offset + n > dep->de_FileSize) { dep->de_FileSize = uio->uio_offset + n; /* The object size needs to be set before buffer is allocated */ vnode_pager_setsize(vp, dep->de_FileSize); } bn = de_cluster(pmp, uio->uio_offset); if ((uio->uio_offset & pmp->pm_crbomask) == 0 && (de_cluster(pmp, uio->uio_offset + uio->uio_resid) > de_cluster(pmp, uio->uio_offset) || uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) { /* * If either the whole cluster gets written, * or we write the cluster from its start beyond EOF, * then no need to read data from disk. */ bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0, 0); /* * This call to vfs_bio_clrbuf() ensures that * even if vn_io_fault_uiomove() below faults, * garbage from the newly instantiated buffer * is not exposed to the userspace via mmap(). */ vfs_bio_clrbuf(bp); /* * Do the bmap now, since pcbmap needs buffers * for the FAT table. (see msdosfs_strategy) */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &bn, 0, 0); if (error) bp->b_blkno = -1; else bp->b_blkno = bn; } if (bp->b_blkno == -1) { brelse(bp); if (!error) error = EIO; /* XXX */ break; } } else { /* * The block we need to write into exists, so read it in. */ error = bread(thisvp, bn, pmp->pm_bpcluster, cred, &bp); if (error) { break; } } /* * Should these vnode_pager_* functions be done on dir * files? */ /* * Copy the data from user space into the buf header. */ error = vn_io_fault_uiomove(bp->b_data + croffset, n, uio); if (error) { brelse(bp); break; } /* Prepare for clustered writes in some else clauses. */ if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) bp->b_flags |= B_CLUSTEROK; /* * If IO_SYNC, then each buffer is written synchronously. * Otherwise, if we have a severe page deficiency then * write the buffer asynchronously. Otherwise, if on a * cluster boundary then write the buffer asynchronously, * combining it with contiguous clusters if permitted and * possible, since we don't expect more writes into this * buffer soon. Otherwise, do a delayed write because we * expect more writes into this buffer soon. */ if (ioflag & IO_SYNC) (void)bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) bawrite(bp); else if (n + croffset == pmp->pm_bpcluster) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) cluster_write(vp, &dep->de_clusterw, bp, dep->de_FileSize, seqcount, 0); else bawrite(bp); } else bdwrite(bp); dep->de_flag |= DE_UPDATE; } while (error == 0 && uio->uio_resid > 0); /* * If the write failed and they want us to, truncate the file back * to the size it was before the write was attempted. */ errexit: if (error) { if (ioflag & IO_UNIT) { detrunc(dep, osize, ioflag & IO_SYNC, NOCRED); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } else { detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED); if (uio->uio_resid != resid) error = 0; } } else if (ioflag & IO_SYNC) error = deupdat(dep, 1); return (error); } /* * Flush the blocks of a file to disk. */ static int msdosfs_fsync(struct vop_fsync_args *ap) { struct vnode *devvp; int allerror, error; vop_stdfsync(ap); /* * If the syncing request comes from fsync(2), sync the entire * FAT and any other metadata that happens to be on devvp. We * need this mainly for the FAT. We write the FAT sloppily, and * syncing it all now is the best we can easily do to get all * directory entries associated with the file (not just the file) * fully synced. The other metadata includes critical metadata * for all directory entries, but only in the MNT_ASYNC case. We * will soon sync all metadata in the file's directory entry. * Non-critical metadata for associated directory entries only * gets synced accidentally, as in most file systems. */ if (ap->a_waitfor != MNT_NOWAIT) { devvp = VTODE(ap->a_vp)->de_pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); allerror = VOP_FSYNC(devvp, MNT_WAIT, ap->a_td); VOP_UNLOCK(devvp); } else allerror = 0; error = deupdat(VTODE(ap->a_vp), ap->a_waitfor != MNT_NOWAIT); if (allerror == 0) allerror = error; return (allerror); } static int msdosfs_remove(struct vop_remove_args *ap) { struct denode *dep = VTODE(ap->a_vp); struct denode *ddep = VTODE(ap->a_dvp); int error; if (ap->a_vp->v_type == VDIR) error = EPERM; else error = removede(ddep, dep); #ifdef MSDOSFS_DEBUG printf("msdosfs_remove(), dep %p, v_usecount %d\n", dep, ap->a_vp->v_usecount); #endif return (error); } /* * DOS filesystems don't know what links are. */ static int msdosfs_link(struct vop_link_args *ap) { return (EOPNOTSUPP); } /* * Renames on files require moving the denode to a new hash queue since the * denode's location is used to compute which hash queue to put the file * in. Unless it is a rename in place. For example "mv a b". * * What follows is the basic algorithm: * * if (file move) { * if (dest file exists) { * remove dest file * } * if (dest and src in same directory) { * rewrite name in existing directory slot * } else { * write new entry in dest directory * update offset and dirclust in denode * move denode to new hash chain * clear old directory entry * } * } else { * directory move * if (dest directory exists) { * if (dest is not empty) { * return ENOTEMPTY * } * remove dest directory * } * if (dest and src in same directory) { * rewrite name in existing entry * } else { * be sure dest is not a child of src directory * write entry in dest directory * update "." and ".." in moved directory * clear old directory entry for moved directory * } * } * * On entry: * source's parent directory is unlocked * source file or directory is unlocked * destination's parent directory is locked * destination file or directory is locked if it exists * * On exit: * all denodes should be released */ static int msdosfs_rename(struct vop_rename_args *ap) { struct vnode *fdvp, *fvp, *tdvp, *tvp, *vp; struct componentname *fcnp, *tcnp; struct denode *fdip, *fip, *tdip, *tip, *nip; u_char toname[12], oldname[11]; u_long to_diroffset; bool checkpath_locked, doingdirectory, newparent; int error; u_long cn, pcl, blkoff; daddr_t bn, wait_scn, scn; struct msdosfsmount *pmp; struct direntry *dotdotp; struct buf *bp; tdvp = ap->a_tdvp; fvp = ap->a_fvp; fdvp = ap->a_fdvp; tvp = ap->a_tvp; tcnp = ap->a_tcnp; fcnp = ap->a_fcnp; pmp = VFSTOMSDOSFS(fdvp->v_mount); -#ifdef DIAGNOSTIC - if ((tcnp->cn_flags & HASBUF) == 0 || - (fcnp->cn_flags & HASBUF) == 0) - panic("msdosfs_rename: no name"); -#endif /* * Check for cross-device rename. */ if (fvp->v_mount != tdvp->v_mount || (tvp != NULL && fvp->v_mount != tvp->v_mount)) { error = EXDEV; goto abortit; } /* * If source and dest are the same, do nothing. */ if (tvp == fvp) { error = 0; goto abortit; } /* * When the target exists, both the directory * and target vnodes are passed locked. */ VOP_UNLOCK(tdvp); if (tvp != NULL && tvp != tdvp) VOP_UNLOCK(tvp); checkpath_locked = false; relock: doingdirectory = newparent = false; error = vn_lock(fdvp, LK_EXCLUSIVE); if (error != 0) goto releout; if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(fdvp); error = vn_lock(tdvp, LK_EXCLUSIVE); if (error != 0) goto releout; VOP_UNLOCK(tdvp); goto relock; } error = msdosfs_lookup_ino(fdvp, NULL, fcnp, &scn, &blkoff); if (error != 0) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); goto releout; } error = deget(pmp, scn, blkoff, LK_EXCLUSIVE | LK_NOWAIT, &nip); if (error != 0) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); if (error != EBUSY) goto releout; error = deget(pmp, scn, blkoff, LK_EXCLUSIVE, &nip); if (error != 0) goto releout; vp = fvp; fvp = DETOV(nip); VOP_UNLOCK(fvp); vrele(vp); goto relock; } vrele(fvp); fvp = DETOV(nip); error = msdosfs_lookup_ino(tdvp, NULL, tcnp, &scn, &blkoff); if (error != 0 && error != EJUSTRETURN) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(fvp); goto releout; } if (error == EJUSTRETURN && tvp != NULL) { vrele(tvp); tvp = NULL; } if (error == 0) { nip = NULL; error = deget(pmp, scn, blkoff, LK_EXCLUSIVE | LK_NOWAIT, &nip); if (tvp != NULL) { vrele(tvp); tvp = NULL; } if (error != 0) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(fvp); if (error != EBUSY) goto releout; error = deget(pmp, scn, blkoff, LK_EXCLUSIVE, &nip); if (error != 0) goto releout; vput(DETOV(nip)); goto relock; } tvp = DETOV(nip); } fdip = VTODE(fdvp); fip = VTODE(fvp); tdip = VTODE(tdvp); tip = tvp != NULL ? VTODE(tvp) : NULL; /* * Remember direntry place to use for destination */ to_diroffset = tdip->de_fndoffset; /* * Be sure we are not renaming ".", "..", or an alias of ".". This * leads to a crippled directory tree. It's pretty tough to do a * "ls" or "pwd" with the "." directory entry missing, and "cd .." * doesn't work if the ".." entry is missing. */ if ((fip->de_Attributes & ATTR_DIRECTORY) != 0) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || fdip == fip || (fcnp->cn_flags & ISDOTDOT) != 0 || (tcnp->cn_flags & ISDOTDOT) != 0) { error = EINVAL; goto unlock; } doingdirectory = true; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to doscheckpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, curthread); if (fdip->de_StartCluster != tdip->de_StartCluster) newparent = true; if (doingdirectory && newparent) { if (error != 0) /* write access check above */ goto unlock; lockmgr(&pmp->pm_checkpath_lock, LK_EXCLUSIVE, NULL); checkpath_locked = true; error = doscheckpath(fip, tdip, &wait_scn); if (wait_scn != 0) { lockmgr(&pmp->pm_checkpath_lock, LK_RELEASE, NULL); checkpath_locked = false; VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(fvp); if (tvp != NULL && tvp != tdvp) VOP_UNLOCK(tvp); error = deget(pmp, wait_scn, 0, LK_EXCLUSIVE, &nip); if (error == 0) { vput(DETOV(nip)); goto relock; } } if (error != 0) goto unlock; if ((tcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost to startdir"); } if (tip != NULL) { /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((tip->de_Attributes & ATTR_DIRECTORY) != 0) { if (!dosdirempty(tip)) { error = ENOTEMPTY; goto unlock; } if (!doingdirectory) { error = ENOTDIR; goto unlock; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto unlock; } error = msdosfs_lookup_ino(tdvp, NULL, tcnp, &scn, &blkoff); MPASS(error == 0); error = removede(tdip, tip); if (error != 0) goto unlock; vput(tvp); tvp = NULL; tip = NULL; } /* * Convert the filename in tcnp into a dos filename. We copy this * into the denode and directory entry for the destination * file/directory. */ error = uniqdosname(tdip, tcnp, toname); if (error != 0) goto unlock; /* * First write a new entry in the destination * directory and mark the entry in the source directory * as deleted. Then move the denode to the correct hash * chain for its new location in the filesystem. And, if * we moved a directory, then update its .. entry to point * to the new parent directory. */ memcpy(oldname, fip->de_Name, 11); memcpy(fip->de_Name, toname, 11); /* update denode */ error = msdosfs_lookup_ino(tdvp, NULL, tcnp, &scn, &blkoff); MPASS(error == EJUSTRETURN); error = createde(fip, tdip, NULL, tcnp); if (error != 0) { memcpy(fip->de_Name, oldname, 11); goto unlock; } /* * If fip is for a directory, then its name should always * be "." since it is for the directory entry in the * directory itself (msdosfs_lookup() always translates * to the "." entry so as to get a unique denode, except * for the root directory there are different * complications). However, we just corrupted its name * to pass the correct name to createde(). Undo this. */ if ((fip->de_Attributes & ATTR_DIRECTORY) != 0) memcpy(fip->de_Name, oldname, 11); fip->de_refcnt++; error = msdosfs_lookup_ino(fdvp, NULL, fcnp, &scn, &blkoff); MPASS(error == 0); error = removede(fdip, fip); if (error != 0) { /* XXX should downgrade to ro here, fs is corrupt */ goto unlock; } if (!doingdirectory) { error = pcbmap(tdip, de_cluster(pmp, to_diroffset), 0, &fip->de_dirclust, 0); if (error != 0) { /* * XXX should downgrade to ro here, * fs is corrupt */ goto unlock; } if (fip->de_dirclust == MSDOSFSROOT) fip->de_diroffset = to_diroffset; else fip->de_diroffset = to_diroffset & pmp->pm_crbomask; } reinsert(fip); /* * If we moved a directory to a new parent directory, then we must * fixup the ".." entry in the moved directory. */ if (doingdirectory && newparent) { cn = fip->de_StartCluster; if (cn == MSDOSFSROOT) { /* this should never happen */ panic("msdosfs_rename(): updating .. in root directory?"); } else bn = cntobn(pmp, cn); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); if (error != 0) { /* XXX should downgrade to ro here, fs is corrupt */ goto unlock; } dotdotp = (struct direntry *)bp->b_data + 1; pcl = tdip->de_StartCluster; if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = MSDOSFSROOT; putushort(dotdotp->deStartCluster, pcl); if (FAT32(pmp)) putushort(dotdotp->deHighClust, pcl >> 16); if (DOINGASYNC(fvp)) bdwrite(bp); else if ((error = bwrite(bp)) != 0) { /* XXX should downgrade to ro here, fs is corrupt */ goto unlock; } } /* * The msdosfs lookup is case insensitive. Several aliases may * be inserted for a single directory entry. As a consequnce, * name cache purge done by lookup for fvp when DELETE op for * namei is specified, might be not enough to expunge all * namecache entries that were installed for this direntry. */ cache_purge(fvp); unlock: if (checkpath_locked) lockmgr(&pmp->pm_checkpath_lock, LK_RELEASE, NULL); vput(fdvp); vput(fvp); if (tvp != NULL) { if (tvp != tdvp) vput(tvp); else vrele(tvp); } vput(tdvp); return (error); releout: MPASS(!checkpath_locked); vrele(tdvp); if (tvp != NULL) vrele(tvp); vrele(fdvp); vrele(fvp); return (error); abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp != NULL) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } static struct { struct direntry dot; struct direntry dotdot; } dosdirtemplate = { { ". ", /* the . entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ }, { ".. ", /* the .. entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ } }; static int msdosfs_mkdir(struct vop_mkdir_args *ap) { struct componentname *cnp = ap->a_cnp; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct direntry *denp; struct msdosfsmount *pmp = pdep->de_pmp; struct buf *bp; u_long newcluster, pcl; int bn; int error; struct denode ndirent; struct timespec ts; /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad2; } /* * Allocate a cluster to hold the about to be created directory. */ error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL); if (error) goto bad2; memset(&ndirent, 0, sizeof(ndirent)); ndirent.de_pmp = pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; vfs_timestamp(&ts); DETIMES(&ndirent, &ts, &ts, &ts); /* * Now fill the cluster with the "." and ".." entries. And write * the cluster to disk. This way it is there for the parent * directory to be pointing at if there were a crash. */ bn = cntobn(pmp, newcluster); /* always succeeds */ bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0, 0); memset(bp->b_data, 0, pmp->pm_bpcluster); memcpy(bp->b_data, &dosdirtemplate, sizeof dosdirtemplate); denp = (struct direntry *)bp->b_data; putushort(denp[0].deStartCluster, newcluster); putushort(denp[0].deCDate, ndirent.de_CDate); putushort(denp[0].deCTime, ndirent.de_CTime); denp[0].deCHundredth = ndirent.de_CHun; putushort(denp[0].deADate, ndirent.de_ADate); putushort(denp[0].deMDate, ndirent.de_MDate); putushort(denp[0].deMTime, ndirent.de_MTime); pcl = pdep->de_StartCluster; /* * Although the root directory has a non-magic starting cluster * number for FAT32, chkdsk and fsck_msdosfs still require * references to it in dotdot entries to be magic. */ if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = MSDOSFSROOT; putushort(denp[1].deStartCluster, pcl); putushort(denp[1].deCDate, ndirent.de_CDate); putushort(denp[1].deCTime, ndirent.de_CTime); denp[1].deCHundredth = ndirent.de_CHun; putushort(denp[1].deADate, ndirent.de_ADate); putushort(denp[1].deMDate, ndirent.de_MDate); putushort(denp[1].deMTime, ndirent.de_MTime); if (FAT32(pmp)) { putushort(denp[0].deHighClust, newcluster >> 16); putushort(denp[1].deHighClust, pcl >> 16); } if (DOINGASYNC(ap->a_dvp)) bdwrite(bp); else if ((error = bwrite(bp)) != 0) goto bad; /* * Now build up a directory entry pointing to the newly allocated * cluster. This will be written to an empty slot in the parent * directory. */ -#ifdef DIAGNOSTIC - if ((cnp->cn_flags & HASBUF) == 0) - panic("msdosfs_mkdir: no name"); -#endif error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = ATTR_DIRECTORY; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = newcluster; ndirent.de_FileSize = 0; error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; *ap->a_vpp = DETOV(dep); return (0); bad: clusterfree(pmp, newcluster); bad2: return (error); } static int msdosfs_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct denode *ip, *dp; int error; ip = VTODE(vp); dp = VTODE(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ error = 0; if (!dosdirempty(ip)) { error = ENOTEMPTY; goto out; } /* * Delete the entry from the directory. For dos filesystems this * gets rid of the directory entry on disk, the in memory copy * still exists but the de_refcnt is <= 0. This prevents it from * being found by deget(). When the vput() on dep is done we give * up access and eventually msdosfs_reclaim() will be called which * will remove it from the denode cache. */ error = removede(dp, ip); if (error) goto out; /* * This is where we decrement the link count in the parent * directory. Since dos filesystems don't do this we just purge * the name cache. */ cache_purge(dvp); /* * Truncate the directory that is being deleted. */ error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred); cache_purge(vp); out: return (error); } /* * DOS filesystems don't know what symlinks are. */ static int msdosfs_symlink(struct vop_symlink_args *ap) { return (EOPNOTSUPP); } static int msdosfs_readdir(struct vop_readdir_args *ap) { struct mbnambuf nb; int error = 0; int diff; long n; int blsize; long on; u_long cn; u_long dirsperblk; long bias = 0; daddr_t bn, lbn; struct buf *bp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; struct dirent dirbuf; struct uio *uio = ap->a_uio; uint64_t *cookies = NULL; int ncookies = 0; off_t offset, off; int chksum = -1; #ifdef MSDOSFS_DEBUG printf("msdosfs_readdir(): vp %p, uio %p, cred %p, eofflagp %p\n", ap->a_vp, uio, ap->a_cred, ap->a_eofflag); #endif /* * msdosfs_readdir() won't operate properly on regular files since * it does i/o only with the filesystem vnode, and hence can * retrieve the wrong block from the buffer cache for a plain file. * So, fail attempts to readdir() on a plain file. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) return (ENOTDIR); /* * To be safe, initialize dirbuf */ memset(dirbuf.d_name, 0, sizeof(dirbuf.d_name)); /* * If the user buffer is smaller than the size of one dos directory * entry or the file offset is not a multiple of the size of a * directory entry, then we fail the read. */ off = offset = uio->uio_offset; if (uio->uio_resid < sizeof(struct direntry) || (offset & (sizeof(struct direntry) - 1))) return (EINVAL); if (ap->a_ncookies) { ncookies = uio->uio_resid / 16; cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); /* * If they are reading from the root directory then, we simulate * the . and .. entries since these don't exist in the root * directory. We also set the offset bias to make up for having to * simulate these entries. By this I mean that at file offset 64 we * read the first entry in the root directory that lives on disk. */ if (dep->de_StartCluster == MSDOSFSROOT || (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)) { #if 0 printf("msdosfs_readdir(): going after . or .. in root dir, offset %d\n", offset); #endif bias = 2 * sizeof(struct direntry); if (offset < bias) { for (n = (int)offset / sizeof(struct direntry); n < 2; n++) { dirbuf.d_fileno = FAT32(pmp) ? (uint64_t)cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk : 1; dirbuf.d_type = DT_DIR; switch (n) { case 0: dirbuf.d_namlen = 1; dirbuf.d_name[0] = '.'; break; case 1: dirbuf.d_namlen = 2; dirbuf.d_name[0] = '.'; dirbuf.d_name[1] = '.'; break; } dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); /* NOTE: d_off is the offset of the *next* entry. */ dirbuf.d_off = offset + sizeof(struct direntry); dirent_terminate(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) goto out; error = uiomove(&dirbuf, dirbuf.d_reclen, uio); if (error) goto out; offset += sizeof(struct direntry); off = offset; if (cookies) { *cookies++ = offset; if (--ncookies <= 0) goto out; } } } } mbnambuf_init(&nb); off = offset; while (uio->uio_resid > 0) { lbn = de_cluster(pmp, offset - bias); on = (offset - bias) & pmp->pm_crbomask; n = min(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); if (diff <= 0) break; n = min(n, diff); error = pcbmap(dep, lbn, &bn, &cn, &blsize); if (error) break; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { return (error); } n = min(n, blsize - bp->b_resid); if (n == 0) { brelse(bp); return (EIO); } /* * Convert from dos directory entries to fs-independent * directory entries. */ for (dentp = (struct direntry *)(bp->b_data + on); (char *)dentp < bp->b_data + on + n; dentp++, offset += sizeof(struct direntry)) { #if 0 printf("rd: dentp %08x prev %08x crnt %08x deName %02x attr %02x\n", dentp, prev, crnt, dentp->deName[0], dentp->deAttributes); #endif /* * If this is an unused entry, we can stop. */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); goto out; } /* * Skip deleted entries. */ if (dentp->deName[0] == SLOT_DELETED) { chksum = -1; mbnambuf_init(&nb); continue; } /* * Handle Win95 long directory entries */ if (dentp->deAttributes == ATTR_WIN95) { if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) continue; chksum = win2unixfn(&nb, (struct winentry *)dentp, chksum, pmp); continue; } /* * Skip volume labels */ if (dentp->deAttributes & ATTR_VOLUME) { chksum = -1; mbnambuf_init(&nb); continue; } /* * This computation of d_fileno must match * the computation of va_fileid in * msdosfs_getattr. */ if (dentp->deAttributes & ATTR_DIRECTORY) { cn = getushort(dentp->deStartCluster); if (FAT32(pmp)) { cn |= getushort(dentp->deHighClust) << 16; if (cn == MSDOSFSROOT) cn = pmp->pm_rootdirblk; } if (cn == MSDOSFSROOT && !FAT32(pmp)) dirbuf.d_fileno = 1; else dirbuf.d_fileno = cntobn(pmp, cn) * dirsperblk; dirbuf.d_type = DT_DIR; } else { dirbuf.d_fileno = (uoff_t)offset / sizeof(struct direntry); dirbuf.d_type = DT_REG; } if (chksum != winChksum(dentp->deName)) { dirbuf.d_namlen = dos2unixfn(dentp->deName, (u_char *)dirbuf.d_name, dentp->deLowerCase | ((pmp->pm_flags & MSDOSFSMNT_SHORTNAME) ? (LCASE_BASE | LCASE_EXT) : 0), pmp); mbnambuf_init(&nb); } else mbnambuf_flush(&nb, &dirbuf); chksum = -1; dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); /* NOTE: d_off is the offset of the *next* entry. */ dirbuf.d_off = offset + sizeof(struct direntry); dirent_terminate(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) { brelse(bp); goto out; } error = uiomove(&dirbuf, dirbuf.d_reclen, uio); if (error) { brelse(bp); goto out; } if (cookies) { *cookies++ = offset + sizeof(struct direntry); if (--ncookies <= 0) { brelse(bp); goto out; } } off = offset + sizeof(struct direntry); } brelse(bp); } out: /* Subtract unused cookies */ if (ap->a_ncookies) *ap->a_ncookies -= ncookies; uio->uio_offset = off; /* * Set the eofflag (NFS uses it) */ if (ap->a_eofflag) { if (dep->de_FileSize - (offset - bias) <= 0) *ap->a_eofflag = 1; else *ap->a_eofflag = 0; } return (error); } /*- * a_vp - pointer to the file's vnode * a_bn - logical block number within the file (cluster number for us) * a_bop - where to return the bufobj of the special file containing the fs * a_bnp - where to return the "physical" block number corresponding to a_bn * (relative to the special file; units are blocks of size DEV_BSIZE) * a_runp - where to return the "run past" a_bn. This is the count of logical * blocks whose physical blocks (together with a_bn's physical block) * are contiguous. * a_runb - where to return the "run before" a_bn. */ static int msdosfs_bmap(struct vop_bmap_args *ap) { struct fatcache savefc; struct denode *dep; struct mount *mp; struct msdosfsmount *pmp; struct vnode *vp; daddr_t runbn; u_long cn; int bnpercn, error, maxio, maxrun, run; vp = ap->a_vp; dep = VTODE(vp); pmp = dep->de_pmp; if (ap->a_bop != NULL) *ap->a_bop = &pmp->pm_devvp->v_bufobj; if (ap->a_bnp == NULL) return (0); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; cn = ap->a_bn; if (cn != ap->a_bn) return (EFBIG); error = pcbmap(dep, cn, ap->a_bnp, NULL, NULL); if (error != 0 || (ap->a_runp == NULL && ap->a_runb == NULL)) return (error); /* * Prepare to back out updates of the fatchain cache after the one * for the first block done by pcbmap() above. Without the backout, * then whenever the caller doesn't do i/o to all of the blocks that * we find, the single useful cache entry would be too far in advance * of the actual i/o to work for the next sequential i/o. Then the * FAT would be searched from the beginning. With the backout, the * FAT is searched starting at most a few blocks early. This wastes * much less time. Time is also wasted finding more blocks than the * caller will do i/o to. This is necessary because the runlength * parameters are output-only. */ savefc = dep->de_fc[FC_LASTMAP]; mp = vp->v_mount; maxio = mp->mnt_iosize_max / mp->mnt_stat.f_iosize; bnpercn = de_cn2bn(pmp, 1); if (ap->a_runp != NULL) { maxrun = ulmin(maxio - 1, pmp->pm_maxcluster - cn); for (run = 1; run <= maxrun; run++) { if (pcbmap(dep, cn + run, &runbn, NULL, NULL) != 0 || runbn != *ap->a_bnp + run * bnpercn) break; } *ap->a_runp = run - 1; } if (ap->a_runb != NULL) { maxrun = ulmin(maxio - 1, cn); for (run = 1; run < maxrun; run++) { if (pcbmap(dep, cn - run, &runbn, NULL, NULL) != 0 || runbn != *ap->a_bnp - run * bnpercn) break; } *ap->a_runb = run - 1; } dep->de_fc[FC_LASTMAP] = savefc; return (0); } SYSCTL_NODE(_vfs, OID_AUTO, msdosfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "msdos filesystem"); static int use_buf_pager = 1; SYSCTL_INT(_vfs_msdosfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, "Use buffer pager instead of bmap"); static daddr_t msdosfs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { return (de_cluster(VTODE(vp)->de_pmp, off)); } static int msdosfs_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz) { *sz = VTODE(vp)->de_pmp->pm_bpcluster; return (0); } static int msdosfs_getpages(struct vop_getpages_args *ap) { if (use_buf_pager) return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, msdosfs_gbp_getblkno, msdosfs_gbp_getblksz)); return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); } static int msdosfs_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; struct denode *dep = VTODE(ap->a_vp); struct bufobj *bo; int error = 0; daddr_t blkno; /* * If we don't already know the filesystem relative block number * then get it using pcbmap(). If pcbmap() returns the block * number as -1 then we've got a hole in the file. DOS filesystems * don't allow files with holes, so we shouldn't ever see this. */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &blkno, 0, 0); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if (bp->b_blkno == -1) { bufdone(bp); return (0); } /* * Read/write the block from/to the disk that contains the desired * file block. */ bp->b_iooffset = dbtob(bp->b_blkno); bo = dep->de_pmp->pm_bo; BO_STRATEGY(bo, bp); return (0); } static int msdosfs_print(struct vop_print_args *ap) { struct denode *dep = VTODE(ap->a_vp); printf("\tstartcluster %lu, dircluster %lu, diroffset %lu, ", dep->de_StartCluster, dep->de_dirclust, dep->de_diroffset); printf("on dev %s\n", devtoname(dep->de_pmp->pm_dev)); return (0); } static int msdosfs_pathconf(struct vop_pathconf_args *ap) { struct msdosfsmount *pmp = VTODE(ap->a_vp)->de_pmp; switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 32; return (0); case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = pmp->pm_flags & MSDOSFSMNT_LONGNAME ? WIN_MAXLEN : 12; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } static int msdosfs_vptofh(struct vop_vptofh_args *ap) { struct denode *dep; struct defid *defhp; dep = VTODE(ap->a_vp); defhp = (struct defid *)ap->a_fhp; defhp->defid_len = sizeof(struct defid); defhp->defid_dirclust = dep->de_dirclust; defhp->defid_dirofs = dep->de_diroffset; /* defhp->defid_gen = dep->de_gen; */ return (0); } /* Global vfs data structures for msdosfs */ struct vop_vector msdosfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = msdosfs_access, .vop_bmap = msdosfs_bmap, .vop_getpages = msdosfs_getpages, .vop_cachedlookup = msdosfs_lookup, .vop_open = msdosfs_open, .vop_close = msdosfs_close, .vop_create = msdosfs_create, .vop_fsync = msdosfs_fsync, .vop_fdatasync = vop_stdfdatasync_buf, .vop_getattr = msdosfs_getattr, .vop_inactive = msdosfs_inactive, .vop_link = msdosfs_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = msdosfs_mkdir, .vop_mknod = msdosfs_mknod, .vop_pathconf = msdosfs_pathconf, .vop_print = msdosfs_print, .vop_read = msdosfs_read, .vop_readdir = msdosfs_readdir, .vop_reclaim = msdosfs_reclaim, .vop_remove = msdosfs_remove, .vop_rename = msdosfs_rename, .vop_rmdir = msdosfs_rmdir, .vop_setattr = msdosfs_setattr, .vop_strategy = msdosfs_strategy, .vop_symlink = msdosfs_symlink, .vop_write = msdosfs_write, .vop_vptofh = msdosfs_vptofh, }; VFS_VOP_VECTOR_REGISTER(msdosfs_vnodeops); diff --git a/sys/fs/nfsclient/nfs_clvnops.c b/sys/fs/nfsclient/nfs_clvnops.c index b1a174f171fa..817c1093374c 100644 --- a/sys/fs/nfsclient/nfs_clvnops.c +++ b/sys/fs/nfsclient/nfs_clvnops.c @@ -1,4552 +1,4538 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from nfs_vnops.c 8.16 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); /* * vnode op calls for Sun NFS version 2, 3 and 4 */ #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_nfsclient_accesscache_flush_probe_func_t dtrace_nfscl_accesscache_flush_done_probe; uint32_t nfscl_accesscache_flush_done_id; dtrace_nfsclient_accesscache_get_probe_func_t dtrace_nfscl_accesscache_get_hit_probe, dtrace_nfscl_accesscache_get_miss_probe; uint32_t nfscl_accesscache_get_hit_id; uint32_t nfscl_accesscache_get_miss_id; dtrace_nfsclient_accesscache_load_probe_func_t dtrace_nfscl_accesscache_load_done_probe; uint32_t nfscl_accesscache_load_done_id; #endif /* !KDTRACE_HOOKS */ /* Defs */ #define TRUE 1 #define FALSE 0 extern struct nfsstatsv1 nfsstatsv1; extern int nfsrv_useacl; extern int nfscl_debuglevel; MALLOC_DECLARE(M_NEWNFSREQ); static vop_read_t nfsfifo_read; static vop_write_t nfsfifo_write; static vop_close_t nfsfifo_close; static int nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *, struct thread *); static vop_lookup_t nfs_lookup; static vop_create_t nfs_create; static vop_mknod_t nfs_mknod; static vop_open_t nfs_open; static vop_pathconf_t nfs_pathconf; static vop_close_t nfs_close; static vop_access_t nfs_access; static vop_getattr_t nfs_getattr; static vop_setattr_t nfs_setattr; static vop_read_t nfs_read; static vop_fsync_t nfs_fsync; static vop_remove_t nfs_remove; static vop_link_t nfs_link; static vop_rename_t nfs_rename; static vop_mkdir_t nfs_mkdir; static vop_rmdir_t nfs_rmdir; static vop_symlink_t nfs_symlink; static vop_readdir_t nfs_readdir; static vop_strategy_t nfs_strategy; static int nfs_lookitup(struct vnode *, char *, int, struct ucred *, struct thread *, struct nfsnode **); static int nfs_sillyrename(struct vnode *, struct vnode *, struct componentname *); static vop_access_t nfsspec_access; static vop_readlink_t nfs_readlink; static vop_print_t nfs_print; static vop_advlock_t nfs_advlock; static vop_advlockasync_t nfs_advlockasync; static vop_getacl_t nfs_getacl; static vop_setacl_t nfs_setacl; static vop_advise_t nfs_advise; static vop_allocate_t nfs_allocate; static vop_deallocate_t nfs_deallocate; static vop_copy_file_range_t nfs_copy_file_range; static vop_ioctl_t nfs_ioctl; static vop_getextattr_t nfs_getextattr; static vop_setextattr_t nfs_setextattr; static vop_listextattr_t nfs_listextattr; static vop_deleteextattr_t nfs_deleteextattr; static vop_lock1_t nfs_lock; /* * Global vfs data structures for nfs */ static struct vop_vector newnfs_vnodeops_nosig = { .vop_default = &default_vnodeops, .vop_access = nfs_access, .vop_advlock = nfs_advlock, .vop_advlockasync = nfs_advlockasync, .vop_close = nfs_close, .vop_create = nfs_create, .vop_fsync = nfs_fsync, .vop_getattr = nfs_getattr, .vop_getpages = ncl_getpages, .vop_putpages = ncl_putpages, .vop_inactive = ncl_inactive, .vop_link = nfs_link, .vop_lock1 = nfs_lock, .vop_lookup = nfs_lookup, .vop_mkdir = nfs_mkdir, .vop_mknod = nfs_mknod, .vop_open = nfs_open, .vop_pathconf = nfs_pathconf, .vop_print = nfs_print, .vop_read = nfs_read, .vop_readdir = nfs_readdir, .vop_readlink = nfs_readlink, .vop_reclaim = ncl_reclaim, .vop_remove = nfs_remove, .vop_rename = nfs_rename, .vop_rmdir = nfs_rmdir, .vop_setattr = nfs_setattr, .vop_strategy = nfs_strategy, .vop_symlink = nfs_symlink, .vop_write = ncl_write, .vop_getacl = nfs_getacl, .vop_setacl = nfs_setacl, .vop_advise = nfs_advise, .vop_allocate = nfs_allocate, .vop_deallocate = nfs_deallocate, .vop_copy_file_range = nfs_copy_file_range, .vop_ioctl = nfs_ioctl, .vop_getextattr = nfs_getextattr, .vop_setextattr = nfs_setextattr, .vop_listextattr = nfs_listextattr, .vop_deleteextattr = nfs_deleteextattr, }; VFS_VOP_VECTOR_REGISTER(newnfs_vnodeops_nosig); static int nfs_vnodeops_bypass(struct vop_generic_args *a) { return (vop_sigdefer(&newnfs_vnodeops_nosig, a)); } struct vop_vector newnfs_vnodeops = { .vop_default = &default_vnodeops, .vop_bypass = nfs_vnodeops_bypass, }; VFS_VOP_VECTOR_REGISTER(newnfs_vnodeops); static struct vop_vector newnfs_fifoops_nosig = { .vop_default = &fifo_specops, .vop_access = nfsspec_access, .vop_close = nfsfifo_close, .vop_fsync = nfs_fsync, .vop_getattr = nfs_getattr, .vop_inactive = ncl_inactive, .vop_pathconf = nfs_pathconf, .vop_print = nfs_print, .vop_read = nfsfifo_read, .vop_reclaim = ncl_reclaim, .vop_setattr = nfs_setattr, .vop_write = nfsfifo_write, }; VFS_VOP_VECTOR_REGISTER(newnfs_fifoops_nosig); static int nfs_fifoops_bypass(struct vop_generic_args *a) { return (vop_sigdefer(&newnfs_fifoops_nosig, a)); } struct vop_vector newnfs_fifoops = { .vop_default = &default_vnodeops, .vop_bypass = nfs_fifoops_bypass, }; VFS_VOP_VECTOR_REGISTER(newnfs_fifoops); static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap); static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name, int namelen, struct ucred *cred, struct thread *td); static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td); static int nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp, struct sillyrename *sp); /* * Global variables */ SYSCTL_DECL(_vfs_nfs); static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout"); static int nfs_prime_access_cache = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, prime_access_cache, CTLFLAG_RW, &nfs_prime_access_cache, 0, "Prime NFS ACCESS cache when fetching attributes"); static int newnfs_commit_on_close = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, commit_on_close, CTLFLAG_RW, &newnfs_commit_on_close, 0, "write+commit on close, else only write"); static int nfs_clean_pages_on_close = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW, &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close"); int newnfs_directio_enable = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW, &newnfs_directio_enable, 0, "Enable NFS directio"); int nfs_keep_dirty_on_error; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_keep_dirty_on_error, CTLFLAG_RW, &nfs_keep_dirty_on_error, 0, "Retry pageout if error returned"); /* * This sysctl allows other processes to mmap a file that has been opened * O_DIRECT by a process. In general, having processes mmap the file while * Direct IO is in progress can lead to Data Inconsistencies. But, we allow * this by default to prevent DoS attacks - to prevent a malicious user from * opening up files O_DIRECT preventing other users from mmap'ing these * files. "Protected" environments where stricter consistency guarantees are * required can disable this knob. The process that opened the file O_DIRECT * cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not * meaningful. */ int newnfs_directio_allow_mmap = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW, &newnfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens"); static uint64_t nfs_maxalloclen = 64 * 1024 * 1024; SYSCTL_U64(_vfs_nfs, OID_AUTO, maxalloclen, CTLFLAG_RW, &nfs_maxalloclen, 0, "NFS max allocate/deallocate length"); #define NFSACCESS_ALL (NFSACCESS_READ | NFSACCESS_MODIFY \ | NFSACCESS_EXTEND | NFSACCESS_EXECUTE \ | NFSACCESS_DELETE | NFSACCESS_LOOKUP) /* * SMP Locking Note : * The list of locks after the description of the lock is the ordering * of other locks acquired with the lock held. * np->n_mtx : Protects the fields in the nfsnode. VM Object Lock VI_MTX (acquired indirectly) * nmp->nm_mtx : Protects the fields in the nfsmount. rep->r_mtx * ncl_iod_mutex : Global lock, protects shared nfsiod state. * nfs_reqq_mtx : Global lock, protects the nfs_reqq list. nmp->nm_mtx rep->r_mtx * rep->r_mtx : Protects the fields in an nfsreq. */ static int nfs_lock(struct vop_lock1_args *ap) { struct vnode *vp; struct nfsnode *np; u_quad_t nsize; int error, lktype; bool onfault; vp = ap->a_vp; lktype = ap->a_flags & LK_TYPE_MASK; error = VOP_LOCK1_APV(&default_vnodeops, ap); if (error != 0 || vp->v_op != &newnfs_vnodeops) return (error); np = VTONFS(vp); if (np == NULL) return (0); NFSLOCKNODE(np); if ((np->n_flag & NVNSETSZSKIP) == 0 || (lktype != LK_SHARED && lktype != LK_EXCLUSIVE && lktype != LK_UPGRADE && lktype != LK_TRYUPGRADE)) { NFSUNLOCKNODE(np); return (0); } onfault = (ap->a_flags & LK_EATTR_MASK) == LK_NOWAIT && (ap->a_flags & LK_INIT_MASK) == LK_CANRECURSE && (lktype == LK_SHARED || lktype == LK_EXCLUSIVE); if (onfault && vp->v_vnlock->lk_recurse == 0) { /* * Force retry in vm_fault(), to make the lock request * sleepable, which allows us to piggy-back the * sleepable call to vnode_pager_setsize(). */ NFSUNLOCKNODE(np); VOP_UNLOCK(vp); return (EBUSY); } if ((ap->a_flags & LK_NOWAIT) != 0 || (lktype == LK_SHARED && vp->v_vnlock->lk_recurse > 0)) { NFSUNLOCKNODE(np); return (0); } if (lktype == LK_SHARED) { NFSUNLOCKNODE(np); VOP_UNLOCK(vp); ap->a_flags &= ~(LK_TYPE_MASK | LK_INTERLOCK); ap->a_flags |= LK_EXCLUSIVE; error = VOP_LOCK1_APV(&default_vnodeops, ap); if (error != 0 || vp->v_op != &newnfs_vnodeops) return (error); if (vp->v_data == NULL) goto downgrade; MPASS(vp->v_data == np); NFSLOCKNODE(np); if ((np->n_flag & NVNSETSZSKIP) == 0) { NFSUNLOCKNODE(np); goto downgrade; } } np->n_flag &= ~NVNSETSZSKIP; nsize = np->n_size; NFSUNLOCKNODE(np); vnode_pager_setsize(vp, nsize); downgrade: if (lktype == LK_SHARED) { ap->a_flags &= ~(LK_TYPE_MASK | LK_INTERLOCK); ap->a_flags |= LK_DOWNGRADE; (void)VOP_LOCK1_APV(&default_vnodeops, ap); } return (0); } static int nfs34_access_otw(struct vnode *vp, int wmode, struct thread *td, struct ucred *cred, u_int32_t *retmode) { int error = 0, attrflag, i, lrupos; u_int32_t rmode; struct nfsnode *np = VTONFS(vp); struct nfsvattr nfsva; error = nfsrpc_accessrpc(vp, wmode, cred, td, &nfsva, &attrflag, &rmode); if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (!error) { lrupos = 0; NFSLOCKNODE(np); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) { if (np->n_accesscache[i].uid == cred->cr_uid) { np->n_accesscache[i].mode = rmode; np->n_accesscache[i].stamp = time_second; break; } if (i > 0 && np->n_accesscache[i].stamp < np->n_accesscache[lrupos].stamp) lrupos = i; } if (i == NFS_ACCESSCACHESIZE) { np->n_accesscache[lrupos].uid = cred->cr_uid; np->n_accesscache[lrupos].mode = rmode; np->n_accesscache[lrupos].stamp = time_second; } NFSUNLOCKNODE(np); if (retmode != NULL) *retmode = rmode; KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, rmode, 0); } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } #ifdef KDTRACE_HOOKS if (error != 0) KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, 0, error); #endif return (error); } /* * nfs access vnode op. * For nfs version 2, just return ok. File accesses may fail later. * For nfs version 3, use the access rpc to check accessibility. If file modes * are changed on the server, accesses might still fail later. */ static int nfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int error = 0, i, gotahit; u_int32_t mode, wmode, rmode; int v34 = NFS_ISV34(vp); struct nfsnode *np = VTONFS(vp); /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((ap->a_accmode & (VWRITE | VAPPEND | VWRITE_NAMED_ATTRS | VDELETE_CHILD | VWRITE_ATTRIBUTES | VDELETE | VWRITE_ACL | VWRITE_OWNER)) != 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * For nfs v3 or v4, check to see if we have done this recently, and if * so return our cached result instead of making an ACCESS call. * If not, do an access rpc, otherwise you are stuck emulating * ufs_access() locally using the vattr. This may not be correct, * since the server may apply other access criteria such as * client uid-->server uid mapping that we do not know about. */ if (v34) { if (ap->a_accmode & VREAD) mode = NFSACCESS_READ; else mode = 0; if (vp->v_type != VDIR) { if (ap->a_accmode & VWRITE) mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND); if (ap->a_accmode & VAPPEND) mode |= NFSACCESS_EXTEND; if (ap->a_accmode & VEXEC) mode |= NFSACCESS_EXECUTE; if (ap->a_accmode & VDELETE) mode |= NFSACCESS_DELETE; } else { if (ap->a_accmode & VWRITE) mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND); if (ap->a_accmode & VAPPEND) mode |= NFSACCESS_EXTEND; if (ap->a_accmode & VEXEC) mode |= NFSACCESS_LOOKUP; if (ap->a_accmode & VDELETE) mode |= NFSACCESS_DELETE; if (ap->a_accmode & VDELETE_CHILD) mode |= NFSACCESS_MODIFY; } /* XXX safety belt, only make blanket request if caching */ if (nfsaccess_cache_timeout > 0) { wmode = NFSACCESS_READ | NFSACCESS_MODIFY | NFSACCESS_EXTEND | NFSACCESS_EXECUTE | NFSACCESS_DELETE | NFSACCESS_LOOKUP; } else { wmode = mode; } /* * Does our cached result allow us to give a definite yes to * this request? */ gotahit = 0; NFSLOCKNODE(np); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) { if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) { if (time_second < (np->n_accesscache[i].stamp + nfsaccess_cache_timeout) && (np->n_accesscache[i].mode & mode) == mode) { NFSINCRGLOBAL(nfsstatsv1.accesscache_hits); gotahit = 1; } break; } } NFSUNLOCKNODE(np); #ifdef KDTRACE_HOOKS if (gotahit != 0) KDTRACE_NFS_ACCESSCACHE_GET_HIT(vp, ap->a_cred->cr_uid, mode); else KDTRACE_NFS_ACCESSCACHE_GET_MISS(vp, ap->a_cred->cr_uid, mode); #endif if (gotahit == 0) { /* * Either a no, or a don't know. Go to the wire. */ NFSINCRGLOBAL(nfsstatsv1.accesscache_misses); error = nfs34_access_otw(vp, wmode, ap->a_td, ap->a_cred, &rmode); if (!error && (rmode & mode) != mode) error = EACCES; } return (error); } else { if ((error = nfsspec_access(ap)) != 0) { return (error); } /* * Attempt to prevent a mapped root from accessing a file * which it shouldn't. We try to read a byte from the file * if the user is root and the file is not zero length. * After calling nfsspec_access, we should have the correct * file size cached. */ NFSLOCKNODE(np); if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD) && VTONFS(vp)->n_size > 0) { struct iovec aiov; struct uio auio; char buf[1]; NFSUNLOCKNODE(np); aiov.iov_base = buf; aiov.iov_len = 1; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_resid = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = ap->a_td; if (vp->v_type == VREG) error = ncl_readrpc(vp, &auio, ap->a_cred); else if (vp->v_type == VDIR) { char* bp; bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK); aiov.iov_base = bp; aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ; error = ncl_readdirrpc(vp, &auio, ap->a_cred, ap->a_td); free(bp, M_TEMP); } else if (vp->v_type == VLNK) error = ncl_readlinkrpc(vp, &auio, ap->a_cred); else error = EACCES; } else NFSUNLOCKNODE(np); return (error); } } /* * nfs open vnode op * Check to see if the type is ok * and that deletion is not in progress. * For paged in text files, you will need to flush the page cache * if consistency is lost. */ /* ARGSUSED */ static int nfs_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct vattr vattr; int error; int fmode = ap->a_mode; struct ucred *cred; vm_object_t obj; if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) return (EOPNOTSUPP); /* * For NFSv4, we need to do the Open Op before cache validation, * so that we conform to RFC3530 Sec. 9.3.1. */ if (NFS_ISV4(vp)) { error = nfsrpc_open(vp, fmode, ap->a_cred, ap->a_td); if (error) { error = nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0); return (error); } } /* * Now, if this Open will be doing reading, re-validate/flush the * cache, so that Close/Open coherency is maintained. */ NFSLOCKNODE(np); if (np->n_flag & NMODIFIED) { NFSUNLOCKNODE(np); if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp)) return (EBADF); } error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error == EINTR || error == EIO) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } NFSLOCKNODE(np); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); if (vp->v_type == VDIR) np->n_direofoffset = 0; NFSUNLOCKNODE(np); error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } NFSLOCKNODE(np); np->n_mtime = vattr.va_mtime; if (NFS_ISV4(vp)) np->n_change = vattr.va_filerev; } else { NFSUNLOCKNODE(np); error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } NFSLOCKNODE(np); if ((NFS_ISV4(vp) && np->n_change != vattr.va_filerev) || NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) { if (vp->v_type == VDIR) np->n_direofoffset = 0; NFSUNLOCKNODE(np); if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp)) return (EBADF); } error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error == EINTR || error == EIO) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } NFSLOCKNODE(np); np->n_mtime = vattr.va_mtime; if (NFS_ISV4(vp)) np->n_change = vattr.va_filerev; } } /* * If the object has >= 1 O_DIRECT active opens, we disable caching. */ if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { if (np->n_directio_opens == 0) { NFSUNLOCKNODE(np); if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp)) return (EBADF); } error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } NFSLOCKNODE(np); np->n_flag |= NNONCACHE; } np->n_directio_opens++; } /* If opened for writing via NFSv4.1 or later, mark that for pNFS. */ if (NFSHASPNFS(VFSTONFS(vp->v_mount)) && (fmode & FWRITE) != 0) np->n_flag |= NWRITEOPENED; /* * If this is an open for writing, capture a reference to the * credentials, so they can be used by ncl_putpages(). Using * these write credentials is preferable to the credentials of * whatever thread happens to be doing the VOP_PUTPAGES() since * the write RPCs are less likely to fail with EACCES. */ if ((fmode & FWRITE) != 0) { cred = np->n_writecred; np->n_writecred = crhold(ap->a_cred); } else cred = NULL; NFSUNLOCKNODE(np); if (cred != NULL) crfree(cred); vnode_create_vobject(vp, vattr.va_size, ap->a_td); /* * If the text file has been mmap'd, flush any dirty pages to the * buffer cache and then... * Make sure all writes are pushed to the NFS server. If this is not * done, the modify time of the file can change while the text * file is being executed. This will cause the process that is * executing the text file to be terminated. */ if (vp->v_writecount <= -1) { if ((obj = vp->v_object) != NULL && vm_object_mightbedirty(obj)) { if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp)) return (EBADF); } VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(obj); } /* Now, flush the buffer cache. */ ncl_flush(vp, MNT_WAIT, curthread, 0, 0); /* And, finally, make sure that n_mtime is up to date. */ np = VTONFS(vp); NFSLOCKNODE(np); np->n_mtime = np->n_vattr.na_mtime; NFSUNLOCKNODE(np); } return (0); } /* * nfs close vnode op * What an NFS client should do upon close after writing is a debatable issue. * Most NFS clients push delayed writes to the server upon close, basically for * two reasons: * 1 - So that any write errors may be reported back to the client process * doing the close system call. By far the two most likely errors are * NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure. * 2 - To put a worst case upper bound on cache inconsistency between * multiple clients for the file. * There is also a consistency problem for Version 2 of the protocol w.r.t. * not being able to tell if other clients are writing a file concurrently, * since there is no way of knowing if the changed modify time in the reply * is only due to the write for this client. * (NFS Version 3 provides weak cache consistency data in the reply that * should be sufficient to detect and handle this case.) * * The current code does the following: * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers * for NFS Version 3 - flush dirty buffers to the server but don't invalidate * or commit them (this satisfies 1 and 2 except for the * case where the server crashes after this close but * before the commit RPC, which is felt to be "good * enough". Changing the last argument to ncl_flush() to * a 1 would force a commit operation, if it is felt a * commit is necessary now. * for NFS Version 4 - flush the dirty buffers and commit them, if * nfscl_mustflush() says this is necessary. * It is necessary if there is no write delegation held, * in order to satisfy open/close coherency. * If the file isn't cached on local stable storage, * it may be necessary in order to detect "out of space" * errors from the server, if the write delegation * issued by the server doesn't allow the file to grow. */ /* ARGSUSED */ static int nfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct nfsvattr nfsva; struct ucred *cred; int error = 0, ret, localcred = 0; int fmode = ap->a_fflag; if (NFSCL_FORCEDISM(vp->v_mount)) return (0); /* * During shutdown, a_cred isn't valid, so just use root. */ if (ap->a_cred == NOCRED) { cred = newnfs_getcred(); localcred = 1; } else { cred = ap->a_cred; } if (vp->v_type == VREG) { /* * Examine and clean dirty pages, regardless of NMODIFIED. * This closes a major hole in close-to-open consistency. * We want to push out all dirty pages (and buffers) on * close, regardless of whether they were dirtied by * mmap'ed writes or via write(). */ if (nfs_clean_pages_on_close && vp->v_object) { if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp) && ap->a_fflag != FNONBLOCK) return (EBADF); } VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } NFSLOCKNODE(np); if (np->n_flag & NMODIFIED) { NFSUNLOCKNODE(np); if (NFS_ISV3(vp)) { /* * Under NFSv3 we have dirty buffers to dispose of. We * must flush them to the NFS server. We have the option * of waiting all the way through the commit rpc or just * waiting for the initial write. The default is to only * wait through the initial write so the data is in the * server's cache, which is roughly similar to the state * a standard disk subsystem leaves the file in on close(). * * We cannot clear the NMODIFIED bit in np->n_flag due to * potential races with other processes, and certainly * cannot clear it if we don't commit. * These races occur when there is no longer the old * traditional vnode locking implemented for Vnode Ops. */ int cm = newnfs_commit_on_close ? 1 : 0; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp) && ap->a_fflag != FNONBLOCK) return (EBADF); } error = ncl_flush(vp, MNT_WAIT, ap->a_td, cm, 0); /* np->n_flag &= ~NMODIFIED; */ } else if (NFS_ISV4(vp)) { if (nfscl_mustflush(vp) != 0) { int cm = newnfs_commit_on_close ? 1 : 0; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp) && ap->a_fflag != FNONBLOCK) return (EBADF); } error = ncl_flush(vp, MNT_WAIT, ap->a_td, cm, 0); /* * as above w.r.t races when clearing * NMODIFIED. * np->n_flag &= ~NMODIFIED; */ } } else { if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp) && ap->a_fflag != FNONBLOCK) return (EBADF); } error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); } NFSLOCKNODE(np); } /* * Invalidate the attribute cache in all cases. * An open is going to fetch fresh attrs any way, other procs * on this node that have file open will be forced to do an * otw attr fetch, but this is safe. * --> A user found that their RPC count dropped by 20% when * this was commented out and I can't see any requirement * for it, so I've disabled it when negative lookups are * enabled. (What does this have to do with negative lookup * caching? Well nothing, except it was reported by the * same user that needed negative lookup caching and I wanted * there to be a way to disable it to see if it * is the cause of some caching/coherency issue that might * crop up.) */ if (VFSTONFS(vp->v_mount)->nm_negnametimeo == 0) { np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); } if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; error = np->n_error; } NFSUNLOCKNODE(np); } if (NFS_ISV4(vp)) { /* * Get attributes so "change" is up to date. */ if (error == 0 && nfscl_mustflush(vp) != 0 && vp->v_type == VREG && (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOCTO) == 0) { ret = nfsrpc_getattr(vp, cred, ap->a_td, &nfsva); if (!ret) { np->n_change = nfsva.na_filerev; (void) nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 0); } } /* * and do the close. */ ret = nfsrpc_close(vp, 0, ap->a_td); if (!error && ret) error = ret; if (error) error = nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0); } if (newnfs_directio_enable) KASSERT((np->n_directio_asyncwr == 0), ("nfs_close: dirty unflushed (%d) directio buffers\n", np->n_directio_asyncwr)); if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { NFSLOCKNODE(np); KASSERT((np->n_directio_opens > 0), ("nfs_close: unexpectedly value (0) of n_directio_opens\n")); np->n_directio_opens--; if (np->n_directio_opens == 0) np->n_flag &= ~NNONCACHE; NFSUNLOCKNODE(np); } if (localcred) NFSFREECRED(cred); return (error); } /* * nfs getattr call from vfs. */ static int nfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = curthread; /* XXX */ struct nfsnode *np = VTONFS(vp); int error = 0; struct nfsvattr nfsva; struct vattr *vap = ap->a_vap; struct vattr vattr; /* * Update local times for special files. */ NFSLOCKNODE(np); if (np->n_flag & (NACC | NUPD)) np->n_flag |= NCHG; NFSUNLOCKNODE(np); /* * First look in the cache. */ if (ncl_getattrcache(vp, &vattr) == 0) { ncl_copy_vattr(vap, &vattr); /* * Get the local modify time for the case of a write * delegation. */ nfscl_deleggetmodtime(vp, &vap->va_mtime); return (0); } if (NFS_ISV34(vp) && nfs_prime_access_cache && nfsaccess_cache_timeout > 0) { NFSINCRGLOBAL(nfsstatsv1.accesscache_misses); nfs34_access_otw(vp, NFSACCESS_ALL, td, ap->a_cred, NULL); if (ncl_getattrcache(vp, ap->a_vap) == 0) { nfscl_deleggetmodtime(vp, &ap->a_vap->va_mtime); return (0); } } error = nfsrpc_getattr(vp, ap->a_cred, td, &nfsva); if (!error) error = nfscl_loadattrcache(&vp, &nfsva, vap, 0, 0); if (!error) { /* * Get the local modify time for the case of a write * delegation. */ nfscl_deleggetmodtime(vp, &vap->va_mtime); } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } return (error); } /* * nfs setattr call. */ static int nfs_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct thread *td = curthread; /* XXX */ struct vattr *vap = ap->a_vap; int error = 0; u_quad_t tsize; struct timespec ts; #ifndef nolint tsize = (u_quad_t)0; #endif /* * Setting of flags and marking of atimes are not supported. */ if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); /* * Disallow write attempts if the filesystem is mounted read-only. */ if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: if (vap->va_mtime.tv_sec == VNOVAL && vap->va_atime.tv_sec == VNOVAL && vap->va_birthtime.tv_sec == VNOVAL && vap->va_mode == (mode_t)VNOVAL && vap->va_uid == (uid_t)VNOVAL && vap->va_gid == (gid_t)VNOVAL) return (0); vap->va_size = VNOVAL; break; default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * We run vnode_pager_setsize() early (why?), * we must set np->n_size now to avoid vinvalbuf * V_SAVE races that might setsize a lower * value. */ NFSLOCKNODE(np); tsize = np->n_size; NFSUNLOCKNODE(np); error = ncl_meta_setsize(vp, td, vap->va_size); NFSLOCKNODE(np); if (np->n_flag & NMODIFIED) { tsize = np->n_size; NFSUNLOCKNODE(np); error = ncl_vinvalbuf(vp, vap->va_size == 0 ? 0 : V_SAVE, td, 1); if (error != 0) { vnode_pager_setsize(vp, tsize); return (error); } /* * Call nfscl_delegmodtime() to set the modify time * locally, as required. */ nfscl_delegmodtime(vp); } else NFSUNLOCKNODE(np); /* * np->n_size has already been set to vap->va_size * in ncl_meta_setsize(). We must set it again since * nfs_loadattrcache() could be called through * ncl_meta_setsize() and could modify np->n_size. */ NFSLOCKNODE(np); np->n_vattr.na_size = np->n_size = vap->va_size; NFSUNLOCKNODE(np); } } else { NFSLOCKNODE(np); if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) && vp->v_type == VREG) { NFSUNLOCKNODE(np); error = ncl_vinvalbuf(vp, V_SAVE, td, 1); if (error == EINTR || error == EIO) return (error); } else NFSUNLOCKNODE(np); } error = nfs_setattrrpc(vp, vap, ap->a_cred, td); if (vap->va_size != VNOVAL) { if (error == 0) { nanouptime(&ts); NFSLOCKNODE(np); np->n_localmodtime = ts; NFSUNLOCKNODE(np); } else { NFSLOCKNODE(np); np->n_size = np->n_vattr.na_size = tsize; vnode_pager_setsize(vp, tsize); NFSUNLOCKNODE(np); } } return (error); } /* * Do an nfs setattr rpc. */ static int nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { struct nfsnode *np = VTONFS(vp); int error, ret, attrflag, i; struct nfsvattr nfsva; if (NFS_ISV34(vp)) { NFSLOCKNODE(np); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) np->n_accesscache[i].stamp = 0; np->n_flag |= NDELEGMOD; NFSUNLOCKNODE(np); KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp); } error = nfsrpc_setattr(vp, vap, NULL, cred, td, &nfsva, &attrflag); if (attrflag) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (ret && !error) error = ret; } if (error && NFS_ISV4(vp)) error = nfscl_maperr(td, error, vap->va_uid, vap->va_gid); return (error); } /* * nfs lookup call, one step at a time... * First look in cache * If not found, unlock the directory nfsnode and do the rpc */ static int nfs_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct mount *mp = dvp->v_mount; int flags = cnp->cn_flags; struct vnode *newvp; struct nfsmount *nmp; struct nfsnode *np, *newnp; int error = 0, attrflag, dattrflag, ltype, ncticks; struct thread *td = curthread; struct nfsfh *nfhp; struct nfsvattr dnfsva, nfsva; struct vattr vattr; struct timespec nctime, ts; uint32_t openmode; *vpp = NULLVP; if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if (dvp->v_type != VDIR) return (ENOTDIR); nmp = VFSTONFS(mp); np = VTONFS(dvp); /* For NFSv4, wait until any remove is done. */ NFSLOCKNODE(np); while (NFSHASNFSV4(nmp) && (np->n_flag & NREMOVEINPROG)) { np->n_flag |= NREMOVEWANT; (void) msleep((caddr_t)np, &np->n_mtx, PZERO, "nfslkup", 0); } NFSUNLOCKNODE(np); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); error = cache_lookup(dvp, vpp, cnp, &nctime, &ncticks); if (error > 0 && error != ENOENT) return (error); if (error == -1) { /* * Lookups of "." are special and always return the * current directory. cache_lookup() already handles * associated locking bookkeeping, etc. */ if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { - /* XXX: Is this really correct? */ - if (cnp->cn_nameiop != LOOKUP && - (flags & ISLASTCN)) - cnp->cn_flags |= SAVENAME; return (0); } /* * We only accept a positive hit in the cache if the * change time of the file matches our cached copy. * Otherwise, we discard the cache entry and fallback * to doing a lookup RPC. We also only trust cache * entries for less than nm_nametimeo seconds. * * To better handle stale file handles and attributes, * clear the attribute cache of this node if it is a * leaf component, part of an open() call, and not * locally modified before fetching the attributes. * This should allow stale file handles to be detected * here where we can fall back to a LOOKUP RPC to * recover rather than having nfs_open() detect the * stale file handle and failing open(2) with ESTALE. */ newvp = *vpp; newnp = VTONFS(newvp); if (!(nmp->nm_flag & NFSMNT_NOCTO) && (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) && !(newnp->n_flag & NMODIFIED)) { NFSLOCKNODE(newnp); newnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp); NFSUNLOCKNODE(newnp); } if (nfscl_nodeleg(newvp, 0) == 0 || ((u_int)(ticks - ncticks) < (nmp->nm_nametimeo * hz) && VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 && timespeccmp(&vattr.va_ctime, &nctime, ==))) { NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits); - if (cnp->cn_nameiop != LOOKUP && - (flags & ISLASTCN)) - cnp->cn_flags |= SAVENAME; return (0); } cache_purge(newvp); if (dvp != newvp) vput(newvp); else vrele(newvp); *vpp = NULLVP; } else if (error == ENOENT) { if (VN_IS_DOOMED(dvp)) return (ENOENT); /* * We only accept a negative hit in the cache if the * modification time of the parent directory matches * the cached copy in the name cache entry. * Otherwise, we discard all of the negative cache * entries for this directory. We also only trust * negative cache entries for up to nm_negnametimeo * seconds. */ if ((u_int)(ticks - ncticks) < (nmp->nm_negnametimeo * hz) && VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 && timespeccmp(&vattr.va_mtime, &nctime, ==)) { NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits); return (ENOENT); } cache_purge_negative(dvp); } openmode = 0; /* * If this an NFSv4.1/4.2 mount using the "oneopenown" mount * option, it is possible to do the Open operation in the same * compound as Lookup, so long as delegations are not being * issued. This saves doing a separate RPC for Open. * For pnfs, do not do this, since the Open+LayoutGet will * be needed as a separate RPC. */ NFSLOCKMNT(nmp); if (NFSHASNFSV4N(nmp) && NFSHASONEOPENOWN(nmp) && !NFSHASPNFS(nmp) && (nmp->nm_privflag & NFSMNTP_DELEGISSUED) == 0 && (!NFSMNT_RDONLY(mp) || (flags & OPENWRITE) == 0) && (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN)) { if ((flags & OPENREAD) != 0) openmode |= NFSV4OPEN_ACCESSREAD; if ((flags & OPENWRITE) != 0) openmode |= NFSV4OPEN_ACCESSWRITE; } NFSUNLOCKMNT(nmp); newvp = NULLVP; NFSINCRGLOBAL(nfsstatsv1.lookupcache_misses); nanouptime(&ts); error = nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, td, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, openmode); if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); if (error) { if (newvp != NULLVP) { vput(newvp); *vpp = NULLVP; } if (error != ENOENT) { if (NFS_ISV4(dvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* The requested file was not found. */ if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN)) { /* * XXX: UFS does a full VOP_ACCESS(dvp, * VWRITE) here instead of just checking * MNT_RDONLY. */ if (mp->mnt_flag & MNT_RDONLY) return (EROFS); - cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } if ((cnp->cn_flags & MAKEENTRY) != 0 && dattrflag) { /* * Cache the modification time of the parent * directory from the post-op attributes in * the name cache entry. The negative cache * entry will be ignored once the directory * has changed. Don't bother adding the entry * if the directory has already changed. */ NFSLOCKNODE(np); if (timespeccmp(&np->n_vattr.na_mtime, &dnfsva.na_mtime, ==)) { NFSUNLOCKNODE(np); cache_enter_time(dvp, NULL, cnp, &dnfsva.na_mtime, NULL); } else NFSUNLOCKNODE(np); } return (ENOENT); } /* * Handle RENAME case... */ if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) { if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) { free(nfhp, M_NFSFH); return (EISDIR); } error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, LK_EXCLUSIVE); if (error) return (error); newvp = NFSTOV(np); /* * If n_localmodtime >= time before RPC, then * a file modification operation, such as * VOP_SETATTR() of size, has occurred while * the Lookup RPC and acquisition of the vnode * happened. As such, the attributes might * be stale, with possibly an incorrect size. */ NFSLOCKNODE(np); if (timespecisset(&np->n_localmodtime) && timespeccmp(&np->n_localmodtime, &ts, >=)) { NFSCL_DEBUG(4, "nfs_lookup: rename localmod " "stale attributes\n"); attrflag = 0; } NFSUNLOCKNODE(np); if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); *vpp = newvp; - cnp->cn_flags |= SAVENAME; return (0); } if (flags & ISDOTDOT) { ltype = NFSVOPISLOCKED(dvp); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); NFSVOPUNLOCK(dvp); error = vfs_busy(mp, 0); NFSVOPLOCK(dvp, ltype | LK_RETRY); vfs_rel(mp); if (error == 0 && VN_IS_DOOMED(dvp)) { vfs_unbusy(mp); error = ENOENT; } if (error != 0) return (error); } NFSVOPUNLOCK(dvp); error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, cnp->cn_lkflags); if (error == 0) newvp = NFSTOV(np); vfs_unbusy(mp); if (newvp != dvp) NFSVOPLOCK(dvp, ltype | LK_RETRY); if (VN_IS_DOOMED(dvp)) { if (error == 0) { if (newvp == dvp) vrele(newvp); else vput(newvp); } error = ENOENT; } if (error != 0) return (error); if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); } else if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) { free(nfhp, M_NFSFH); VREF(dvp); newvp = dvp; if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); } else { error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, cnp->cn_lkflags); if (error) return (error); newvp = NFSTOV(np); /* * If n_localmodtime >= time before RPC, then * a file modification operation, such as * VOP_SETATTR() of size, has occurred while * the Lookup RPC and acquisition of the vnode * happened. As such, the attributes might * be stale, with possibly an incorrect size. */ NFSLOCKNODE(np); if (timespecisset(&np->n_localmodtime) && timespeccmp(&np->n_localmodtime, &ts, >=)) { NFSCL_DEBUG(4, "nfs_lookup: localmod " "stale attributes\n"); attrflag = 0; } NFSUNLOCKNODE(np); if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); else if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) && !(np->n_flag & NMODIFIED)) { /* * Flush the attribute cache when opening a * leaf node to ensure that fresh attributes * are fetched in nfs_open() since we did not * fetch attributes from the LOOKUP reply. */ NFSLOCKNODE(np); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp); NFSUNLOCKNODE(np); } } - if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) - cnp->cn_flags |= SAVENAME; if ((cnp->cn_flags & MAKEENTRY) && dvp != newvp && (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN)) && attrflag != 0 && (newvp->v_type != VDIR || dattrflag != 0)) cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, newvp->v_type != VDIR ? NULL : &dnfsva.na_ctime); *vpp = newvp; return (0); } /* * nfs read call. * Just call ncl_bioread() to do the work. */ static int nfs_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; switch (vp->v_type) { case VREG: return (ncl_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); case VDIR: return (EISDIR); default: return (EOPNOTSUPP); } } /* * nfs readlink call */ static int nfs_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; if (vp->v_type != VLNK) return (EINVAL); return (ncl_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* * Do a readlink rpc. * Called by ncl_doio() from below the buffer cache. */ int ncl_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { int error, ret, attrflag; struct nfsvattr nfsva; error = nfsrpc_readlink(vp, uiop, cred, uiop->uio_td, &nfsva, &attrflag); if (attrflag) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (ret && !error) error = ret; } if (error && NFS_ISV4(vp)) error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs read rpc call * Ditto above */ int ncl_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) { int error, ret, attrflag; struct nfsvattr nfsva; struct nfsmount *nmp; nmp = VFSTONFS(vp->v_mount); error = EIO; attrflag = 0; if (NFSHASPNFS(nmp)) error = nfscl_doiods(vp, uiop, NULL, NULL, NFSV4OPEN_ACCESSREAD, 0, cred, uiop->uio_td); NFSCL_DEBUG(4, "readrpc: aft doiods=%d\n", error); if (error != 0) error = nfsrpc_read(vp, uiop, cred, uiop->uio_td, &nfsva, &attrflag); if (attrflag) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (ret && !error) error = ret; } if (error && NFS_ISV4(vp)) error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs write call */ int ncl_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred, int *iomode, int *must_commit, int called_from_strategy, int ioflag) { struct nfsvattr nfsva; int error, attrflag, ret; struct nfsmount *nmp; nmp = VFSTONFS(vp->v_mount); error = EIO; attrflag = 0; if (NFSHASPNFS(nmp)) error = nfscl_doiods(vp, uiop, iomode, must_commit, NFSV4OPEN_ACCESSWRITE, 0, cred, uiop->uio_td); NFSCL_DEBUG(4, "writerpc: aft doiods=%d\n", error); if (error != 0) error = nfsrpc_write(vp, uiop, iomode, must_commit, cred, uiop->uio_td, &nfsva, &attrflag, called_from_strategy, ioflag); if (attrflag) { if (VTONFS(vp)->n_flag & ND_NFSV4) ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 1, 1); else ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (ret && !error) error = ret; } if (DOINGASYNC(vp)) *iomode = NFSWRITE_FILESYNC; if (error && NFS_ISV4(vp)) error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs mknod rpc * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the * mode set to specify the file type and the size field for rdev. */ static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap) { struct nfsvattr nfsva, dnfsva; struct vnode *newvp = NULL; struct nfsnode *np = NULL, *dnp; struct nfsfh *nfhp; struct vattr vattr; int error = 0, attrflag, dattrflag; u_int32_t rdev; if (vap->va_type == VCHR || vap->va_type == VBLK) rdev = vap->va_rdev; else if (vap->va_type == VFIFO || vap->va_type == VSOCK) rdev = 0xffffffff; else return (EOPNOTSUPP); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred))) return (error); error = nfsrpc_mknod(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap, rdev, vap->va_type, cnp->cn_cred, curthread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag); if (!error) { if (!nfhp) (void) nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, curthread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, 0); if (nfhp) error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, curthread, &np, LK_EXCLUSIVE); } if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); if (!error) { newvp = NFSTOV(np); if (attrflag != 0) { error = nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); if (error != 0) vput(newvp); } } if (!error) { *vpp = newvp; } else if (NFS_ISV4(dvp)) { error = nfscl_maperr(curthread, error, vap->va_uid, vap->va_gid); } dnp = VTONFS(dvp); NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (!dattrflag) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } NFSUNLOCKNODE(dnp); return (error); } /* * nfs mknod vop * just call nfs_mknodrpc() to do the work. */ /* ARGSUSED */ static int nfs_mknod(struct vop_mknod_args *ap) { return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap)); } static struct mtx nfs_cverf_mtx; MTX_SYSINIT(nfs_cverf_mtx, &nfs_cverf_mtx, "NFS create verifier mutex", MTX_DEF); static nfsquad_t nfs_get_cverf(void) { static nfsquad_t cverf; nfsquad_t ret; static int cverf_initialized = 0; mtx_lock(&nfs_cverf_mtx); if (cverf_initialized == 0) { cverf.lval[0] = arc4random(); cverf.lval[1] = arc4random(); cverf_initialized = 1; } else cverf.qval++; ret = cverf; mtx_unlock(&nfs_cverf_mtx); return (ret); } /* * nfs file create call */ static int nfs_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct nfsnode *np = NULL, *dnp; struct vnode *newvp = NULL; struct nfsmount *nmp; struct nfsvattr dnfsva, nfsva; struct nfsfh *nfhp; nfsquad_t cverf; int error = 0, attrflag, dattrflag, fmode = 0; struct vattr vattr; /* * Oops, not for me.. */ if (vap->va_type == VSOCK) return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap)); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred))) return (error); if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= O_EXCL; dnp = VTONFS(dvp); nmp = VFSTONFS(dvp->v_mount); again: /* For NFSv4, wait until any remove is done. */ NFSLOCKNODE(dnp); while (NFSHASNFSV4(nmp) && (dnp->n_flag & NREMOVEINPROG)) { dnp->n_flag |= NREMOVEWANT; (void) msleep((caddr_t)dnp, &dnp->n_mtx, PZERO, "nfscrt", 0); } NFSUNLOCKNODE(dnp); cverf = nfs_get_cverf(); error = nfsrpc_create(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap, cverf, fmode, cnp->cn_cred, curthread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag); if (!error) { if (nfhp == NULL) (void) nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, curthread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, 0); if (nfhp != NULL) error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, curthread, &np, LK_EXCLUSIVE); } if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); if (!error) { newvp = NFSTOV(np); if (attrflag == 0) error = nfsrpc_getattr(newvp, cnp->cn_cred, curthread, &nfsva); if (error == 0) error = nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); } if (error) { if (newvp != NULL) { vput(newvp); newvp = NULL; } if (NFS_ISV34(dvp) && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) { fmode &= ~O_EXCL; goto again; } } else if (NFS_ISV34(dvp) && (fmode & O_EXCL)) { if (nfscl_checksattr(vap, &nfsva)) { error = nfsrpc_setattr(newvp, vap, NULL, cnp->cn_cred, curthread, &nfsva, &attrflag); if (error && (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL)) { /* try again without setting uid/gid */ vap->va_uid = (uid_t)VNOVAL; vap->va_gid = (uid_t)VNOVAL; error = nfsrpc_setattr(newvp, vap, NULL, cnp->cn_cred, curthread, &nfsva, &attrflag); } if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); if (error != 0) vput(newvp); } } if (!error) { if ((cnp->cn_flags & MAKEENTRY) && attrflag) { if (dvp != newvp) cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, NULL); else printf("nfs_create: bogus NFS server returned " "the directory as the new file object\n"); } *ap->a_vpp = newvp; } else if (NFS_ISV4(dvp)) { error = nfscl_maperr(curthread, error, vap->va_uid, vap->va_gid); } NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (!dattrflag) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } NFSUNLOCKNODE(dnp); return (error); } /* * nfs file remove call * To try and make nfs semantics closer to ufs semantics, a file that has * other processes using the vnode is renamed instead of removed and then * removed later on the last close. * - If v_usecount > 1 * If a rename is not already in the works * call nfs_sillyrename() to set it up * else * do the remove rpc */ static int nfs_remove(struct vop_remove_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct nfsnode *np = VTONFS(vp); int error = 0; struct vattr vattr; - KASSERT((cnp->cn_flags & HASBUF) != 0, ("nfs_remove: no name")); KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount")); if (vp->v_type == VDIR) error = EPERM; else if (vrefcnt(vp) == 1 || (np->n_sillyrename && VOP_GETATTR(vp, &vattr, cnp->cn_cred) == 0 && vattr.va_nlink > 1)) { /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is * minimized. Without node locking it can still happen, such * that an I/O op returns ESTALE, but since you get this if * another host removes the file.. */ cache_purge(vp); /* * throw away biocache buffers, mainly to avoid * unnecessary delayed writes later. */ error = ncl_vinvalbuf(vp, 0, curthread, 1); if (error != EINTR && error != EIO) /* Do the rpc */ error = nfs_removerpc(dvp, vp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, curthread); /* * Kludge City: If the first reply to the remove rpc is lost.. * the reply to the retransmitted request will be ENOENT * since the file was in fact removed * Therefore, we cheat and return success. */ if (error == ENOENT) error = 0; } else if (!np->n_sillyrename) error = nfs_sillyrename(dvp, vp, cnp); NFSLOCKNODE(np); np->n_attrstamp = 0; NFSUNLOCKNODE(np); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); return (error); } /* * nfs file remove rpc called from nfs_inactive */ int ncl_removeit(struct sillyrename *sp, struct vnode *vp) { /* * Make sure that the directory vnode is still valid. * XXX we should lock sp->s_dvp here. */ if (sp->s_dvp->v_type == VBAD) return (0); return (nfs_removerpc(sp->s_dvp, vp, sp->s_name, sp->s_namlen, sp->s_cred, NULL)); } /* * Nfs remove rpc, called from nfs_remove() and ncl_removeit(). */ static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name, int namelen, struct ucred *cred, struct thread *td) { struct nfsvattr dnfsva; struct nfsnode *dnp = VTONFS(dvp); int error = 0, dattrflag; NFSLOCKNODE(dnp); dnp->n_flag |= NREMOVEINPROG; NFSUNLOCKNODE(dnp); error = nfsrpc_remove(dvp, name, namelen, vp, cred, td, &dnfsva, &dattrflag); NFSLOCKNODE(dnp); if ((dnp->n_flag & NREMOVEWANT)) { dnp->n_flag &= ~(NREMOVEWANT | NREMOVEINPROG); NFSUNLOCKNODE(dnp); wakeup((caddr_t)dnp); } else { dnp->n_flag &= ~NREMOVEINPROG; NFSUNLOCKNODE(dnp); } if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (!dattrflag) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } NFSUNLOCKNODE(dnp); if (error && NFS_ISV4(dvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs file rename call */ static int nfs_rename(struct vop_rename_args *ap) { struct vnode *fvp = ap->a_fvp; struct vnode *tvp = ap->a_tvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tdvp = ap->a_tdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct nfsnode *fnp = VTONFS(ap->a_fvp); struct nfsnode *tdnp = VTONFS(ap->a_tdvp); struct nfsv4node *newv4 = NULL; int error; - KASSERT((tcnp->cn_flags & HASBUF) != 0 && - (fcnp->cn_flags & HASBUF) != 0, ("nfs_rename: no name")); /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } if (fvp == tvp) { printf("nfs_rename: fvp == tvp (can't happen)\n"); error = 0; goto out; } if ((error = NFSVOPLOCK(fvp, LK_EXCLUSIVE)) != 0) goto out; /* * We have to flush B_DELWRI data prior to renaming * the file. If we don't, the delayed-write buffers * can be flushed out later after the file has gone stale * under NFSV3. NFSV2 does not have this problem because * ( as far as I can tell ) it flushes dirty buffers more * often. * * Skip the rename operation if the fsync fails, this can happen * due to the server's volume being full, when we pushed out data * that was written back to our cache earlier. Not checking for * this condition can result in potential (silent) data loss. */ error = VOP_FSYNC(fvp, MNT_WAIT, curthread); NFSVOPUNLOCK(fvp); if (!error && tvp) error = VOP_FSYNC(tvp, MNT_WAIT, curthread); if (error) goto out; /* * If the tvp exists and is in use, sillyrename it before doing the * rename of the new file over it. * XXX Can't sillyrename a directory. */ if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename && tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) { vput(tvp); tvp = NULL; } error = nfs_renamerpc(fdvp, fvp, fcnp->cn_nameptr, fcnp->cn_namelen, tdvp, tvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, curthread); if (error == 0 && NFS_ISV4(tdvp)) { /* * For NFSv4, check to see if it is the same name and * replace the name, if it is different. */ newv4 = malloc( sizeof (struct nfsv4node) + tdnp->n_fhp->nfh_len + tcnp->cn_namelen - 1, M_NFSV4NODE, M_WAITOK); NFSLOCKNODE(tdnp); NFSLOCKNODE(fnp); if (fnp->n_v4 != NULL && fvp->v_type == VREG && (fnp->n_v4->n4_namelen != tcnp->cn_namelen || NFSBCMP(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4), tcnp->cn_namelen) || tdnp->n_fhp->nfh_len != fnp->n_v4->n4_fhlen || NFSBCMP(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data, tdnp->n_fhp->nfh_len))) { #ifdef notdef { char nnn[100]; int nnnl; nnnl = (tcnp->cn_namelen < 100) ? tcnp->cn_namelen : 99; bcopy(tcnp->cn_nameptr, nnn, nnnl); nnn[nnnl] = '\0'; printf("ren replace=%s\n",nnn); } #endif free(fnp->n_v4, M_NFSV4NODE); fnp->n_v4 = newv4; newv4 = NULL; fnp->n_v4->n4_fhlen = tdnp->n_fhp->nfh_len; fnp->n_v4->n4_namelen = tcnp->cn_namelen; NFSBCOPY(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data, tdnp->n_fhp->nfh_len); NFSBCOPY(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4), tcnp->cn_namelen); } NFSUNLOCKNODE(tdnp); NFSUNLOCKNODE(fnp); if (newv4 != NULL) free(newv4, M_NFSV4NODE); } if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); /* * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nfs file rename rpc called from nfs_remove() above */ static int nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp, struct sillyrename *sp) { return (nfs_renamerpc(sdvp, svp, scnp->cn_nameptr, scnp->cn_namelen, sdvp, NULL, sp->s_name, sp->s_namlen, scnp->cn_cred, curthread)); } /* * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit(). */ static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td) { struct nfsvattr fnfsva, tnfsva; struct nfsnode *fdnp = VTONFS(fdvp); struct nfsnode *tdnp = VTONFS(tdvp); int error = 0, fattrflag, tattrflag; error = nfsrpc_rename(fdvp, fvp, fnameptr, fnamelen, tdvp, tvp, tnameptr, tnamelen, cred, td, &fnfsva, &tnfsva, &fattrflag, &tattrflag); NFSLOCKNODE(fdnp); fdnp->n_flag |= NMODIFIED; if (fattrflag != 0) { NFSUNLOCKNODE(fdnp); (void) nfscl_loadattrcache(&fdvp, &fnfsva, NULL, 0, 1); } else { fdnp->n_attrstamp = 0; NFSUNLOCKNODE(fdnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(fdvp); } NFSLOCKNODE(tdnp); tdnp->n_flag |= NMODIFIED; if (tattrflag != 0) { NFSUNLOCKNODE(tdnp); (void) nfscl_loadattrcache(&tdvp, &tnfsva, NULL, 0, 1); } else { tdnp->n_attrstamp = 0; NFSUNLOCKNODE(tdnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp); } if (error && NFS_ISV4(fdvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs hard link create call */ static int nfs_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct nfsnode *np, *tdnp; struct nfsvattr nfsva, dnfsva; int error = 0, attrflag, dattrflag; /* * Push all writes to the server, so that the attribute cache * doesn't get "out of sync" with the server. * XXX There should be a better way! */ VOP_FSYNC(vp, MNT_WAIT, curthread); error = nfsrpc_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, curthread, &dnfsva, &nfsva, &attrflag, &dattrflag); tdnp = VTONFS(tdvp); NFSLOCKNODE(tdnp); tdnp->n_flag |= NMODIFIED; if (dattrflag != 0) { NFSUNLOCKNODE(tdnp); (void) nfscl_loadattrcache(&tdvp, &dnfsva, NULL, 0, 1); } else { tdnp->n_attrstamp = 0; NFSUNLOCKNODE(tdnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp); } if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); else { np = VTONFS(vp); NFSLOCKNODE(np); np->n_attrstamp = 0; NFSUNLOCKNODE(np); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); } /* * If negative lookup caching is enabled, I might as well * add an entry for this node. Not necessary for correctness, * but if negative caching is enabled, then the system * must care about lookup caching hit rate, so... */ if (VFSTONFS(vp->v_mount)->nm_negnametimeo != 0 && (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) { if (tdvp != vp) cache_enter_time(tdvp, vp, cnp, &nfsva.na_ctime, NULL); else printf("nfs_link: bogus NFS server returned " "the directory as the new link\n"); } if (error && NFS_ISV4(vp)) error = nfscl_maperr(curthread, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs symbolic link create call */ static int nfs_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct nfsvattr nfsva, dnfsva; struct nfsfh *nfhp; struct nfsnode *np = NULL, *dnp; struct vnode *newvp = NULL; int error = 0, attrflag, dattrflag, ret; vap->va_type = VLNK; error = nfsrpc_symlink(dvp, cnp->cn_nameptr, cnp->cn_namelen, ap->a_target, vap, cnp->cn_cred, curthread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag); if (nfhp) { ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, curthread, &np, LK_EXCLUSIVE); if (!ret) newvp = NFSTOV(np); else if (!error) error = ret; } if (newvp != NULL) { if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); } else if (!error) { /* * If we do not have an error and we could not extract the * newvp from the response due to the request being NFSv2, we * have to do a lookup in order to obtain a newvp to return. */ error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, curthread, &np); if (!error) newvp = NFSTOV(np); } if (error) { if (newvp) vput(newvp); if (NFS_ISV4(dvp)) error = nfscl_maperr(curthread, error, vap->va_uid, vap->va_gid); } else { *ap->a_vpp = newvp; } dnp = VTONFS(dvp); NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (dattrflag != 0) { NFSUNLOCKNODE(dnp); (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); } else { dnp->n_attrstamp = 0; NFSUNLOCKNODE(dnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } /* * If negative lookup caching is enabled, I might as well * add an entry for this node. Not necessary for correctness, * but if negative caching is enabled, then the system * must care about lookup caching hit rate, so... */ if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 && (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) { if (dvp != newvp) cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, NULL); else printf("nfs_symlink: bogus NFS server returned " "the directory as the new file object\n"); } return (error); } /* * nfs make dir call */ static int nfs_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct nfsnode *np = NULL, *dnp; struct vnode *newvp = NULL; struct vattr vattr; struct nfsfh *nfhp; struct nfsvattr nfsva, dnfsva; int error = 0, attrflag, dattrflag, ret; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0) return (error); vap->va_type = VDIR; error = nfsrpc_mkdir(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap, cnp->cn_cred, curthread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag); dnp = VTONFS(dvp); NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (dattrflag != 0) { NFSUNLOCKNODE(dnp); (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); } else { dnp->n_attrstamp = 0; NFSUNLOCKNODE(dnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } if (nfhp) { ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, curthread, &np, LK_EXCLUSIVE); if (!ret) { newvp = NFSTOV(np); if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); } else if (!error) error = ret; } if (!error && newvp == NULL) { error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, curthread, &np); if (!error) { newvp = NFSTOV(np); if (newvp->v_type != VDIR) error = EEXIST; } } if (error) { if (newvp) vput(newvp); if (NFS_ISV4(dvp)) error = nfscl_maperr(curthread, error, vap->va_uid, vap->va_gid); } else { /* * If negative lookup caching is enabled, I might as well * add an entry for this node. Not necessary for correctness, * but if negative caching is enabled, then the system * must care about lookup caching hit rate, so... */ if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 && (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && dattrflag != 0) { if (dvp != newvp) cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, &dnfsva.na_ctime); else printf("nfs_mkdir: bogus NFS server returned " "the directory that the directory was " "created in as the new file object\n"); } *ap->a_vpp = newvp; } return (error); } /* * nfs remove directory call */ static int nfs_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct nfsnode *dnp; struct nfsvattr dnfsva; int error, dattrflag; if (dvp == vp) return (EINVAL); error = nfsrpc_rmdir(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, curthread, &dnfsva, &dattrflag); dnp = VTONFS(dvp); NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (dattrflag != 0) { NFSUNLOCKNODE(dnp); (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); } else { dnp->n_attrstamp = 0; NFSUNLOCKNODE(dnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } cache_purge(dvp); cache_purge(vp); if (error && NFS_ISV4(dvp)) error = nfscl_maperr(curthread, error, (uid_t)0, (gid_t)0); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nfs readdir call */ static int nfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct uio *uio = ap->a_uio; ssize_t tresid, left; int error = 0; struct vattr vattr; if (ap->a_eofflag != NULL) *ap->a_eofflag = 0; if (vp->v_type != VDIR) return(EPERM); /* * First, check for hit on the EOF offset cache */ NFSLOCKNODE(np); if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset && (np->n_flag & NMODIFIED) == 0) { NFSUNLOCKNODE(np); if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) { NFSLOCKNODE(np); if ((NFS_ISV4(vp) && np->n_change == vattr.va_filerev) || !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) { NFSUNLOCKNODE(np); NFSINCRGLOBAL(nfsstatsv1.direofcache_hits); if (ap->a_eofflag != NULL) *ap->a_eofflag = 1; return (0); } else NFSUNLOCKNODE(np); } } else NFSUNLOCKNODE(np); /* * NFS always guarantees that directory entries don't straddle * DIRBLKSIZ boundaries. As such, we need to limit the size * to an exact multiple of DIRBLKSIZ, to avoid copying a partial * directory entry. */ left = uio->uio_resid % DIRBLKSIZ; if (left == uio->uio_resid) return (EINVAL); uio->uio_resid -= left; /* * Call ncl_bioread() to do the real work. */ tresid = uio->uio_resid; error = ncl_bioread(vp, uio, 0, ap->a_cred); if (!error && uio->uio_resid == tresid) { NFSINCRGLOBAL(nfsstatsv1.direofcache_misses); if (ap->a_eofflag != NULL) *ap->a_eofflag = 1; } /* Add the partial DIRBLKSIZ (left) back in. */ uio->uio_resid += left; return (error); } /* * Readdir rpc call. * Called from below the buffer cache by ncl_doio(). */ int ncl_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred, struct thread *td) { struct nfsvattr nfsva; nfsuint64 *cookiep, cookie; struct nfsnode *dnp = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, eof, attrflag; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirrpc bad uio")); /* * If there is no cookie, assume directory was stale. */ ncl_dircookie_lock(dnp); NFSUNLOCKNODE(dnp); cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) { cookie = *cookiep; ncl_dircookie_unlock(dnp); } else { ncl_dircookie_unlock(dnp); return (NFSERR_BAD_COOKIE); } if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) (void)ncl_fsinfo(nmp, vp, cred, td); error = nfsrpc_readdir(vp, uiop, &cookie, cred, td, &nfsva, &attrflag, &eof); if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (!error) { /* * We are now either at the end of the directory or have filled * the block. */ if (eof) { NFSLOCKNODE(dnp); dnp->n_direofoffset = uiop->uio_offset; NFSUNLOCKNODE(dnp); } else { if (uiop->uio_resid > 0) printf("EEK! readdirrpc resid > 0\n"); ncl_dircookie_lock(dnp); NFSUNLOCKNODE(dnp); cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; ncl_dircookie_unlock(dnp); } } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } return (error); } /* * NFS V3 readdir plus RPC. Used in place of ncl_readdirrpc(). */ int ncl_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred, struct thread *td) { struct nfsvattr nfsva; nfsuint64 *cookiep, cookie; struct nfsnode *dnp = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, attrflag, eof; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirplusrpc bad uio")); /* * If there is no cookie, assume directory was stale. */ ncl_dircookie_lock(dnp); NFSUNLOCKNODE(dnp); cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) { cookie = *cookiep; ncl_dircookie_unlock(dnp); } else { ncl_dircookie_unlock(dnp); return (NFSERR_BAD_COOKIE); } if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) (void)ncl_fsinfo(nmp, vp, cred, td); error = nfsrpc_readdirplus(vp, uiop, &cookie, cred, td, &nfsva, &attrflag, &eof); if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (!error) { /* * We are now either at end of the directory or have filled the * the block. */ if (eof) { NFSLOCKNODE(dnp); dnp->n_direofoffset = uiop->uio_offset; NFSUNLOCKNODE(dnp); } else { if (uiop->uio_resid > 0) printf("EEK! readdirplusrpc resid > 0\n"); ncl_dircookie_lock(dnp); NFSUNLOCKNODE(dnp); cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; ncl_dircookie_unlock(dnp); } } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } return (error); } /* * Silly rename. To make the NFS filesystem that is stateless look a little * more like the "ufs" a remove of an active vnode is translated to a rename * to a funny looking filename that is removed by nfs_inactive on the * nfsnode. There is the potential for another process on a different client * to create the same funny name between the nfs_lookitup() fails and the * nfs_rename() completes, but... */ static int nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct sillyrename *sp; struct nfsnode *np; int error; short pid; unsigned int lticks; cache_purge(dvp); np = VTONFS(vp); KASSERT(vp->v_type != VDIR, ("nfs: sillyrename dir")); sp = malloc(sizeof (struct sillyrename), M_NEWNFSREQ, M_WAITOK); sp->s_cred = crhold(cnp->cn_cred); sp->s_dvp = dvp; VREF(dvp); /* * Fudge together a funny name. * Changing the format of the funny name to accommodate more * sillynames per directory. * The name is now changed to .nfs...4, where ticks is * CPU ticks since boot. */ pid = curthread->td_proc->p_pid; lticks = (unsigned int)ticks; for ( ; ; ) { sp->s_namlen = sprintf(sp->s_name, ".nfs.%08x.%04x4.4", lticks, pid); if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, curthread, NULL)) break; lticks++; } error = nfs_renameit(dvp, vp, cnp, sp); if (error) goto bad; error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, curthread, &np); np->n_sillyrename = sp; return (0); bad: vrele(sp->s_dvp); crfree(sp->s_cred); free(sp, M_NEWNFSREQ); return (error); } /* * Look up a file name and optionally either update the file handle or * allocate an nfsnode, depending on the value of npp. * npp == NULL --> just do the lookup * *npp == NULL --> allocate a new nfsnode and make sure attributes are * handled too * *npp != NULL --> update the file handle in the vnode */ static int nfs_lookitup(struct vnode *dvp, char *name, int len, struct ucred *cred, struct thread *td, struct nfsnode **npp) { struct vnode *newvp = NULL, *vp; struct nfsnode *np, *dnp = VTONFS(dvp); struct nfsfh *nfhp, *onfhp; struct nfsvattr nfsva, dnfsva; struct componentname cn; int error = 0, attrflag, dattrflag; u_int hash; struct timespec ts; nanouptime(&ts); error = nfsrpc_lookup(dvp, name, len, cred, td, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, 0); if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, 0, 1); if (npp && !error) { if (*npp != NULL) { np = *npp; vp = NFSTOV(np); /* * For NFSv4, check to see if it is the same name and * replace the name, if it is different. */ if (np->n_v4 != NULL && nfsva.na_type == VREG && (np->n_v4->n4_namelen != len || NFSBCMP(name, NFS4NODENAME(np->n_v4), len) || dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen || NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data, dnp->n_fhp->nfh_len))) { #ifdef notdef { char nnn[100]; int nnnl; nnnl = (len < 100) ? len : 99; bcopy(name, nnn, nnnl); nnn[nnnl] = '\0'; printf("replace=%s\n",nnn); } #endif free(np->n_v4, M_NFSV4NODE); np->n_v4 = malloc( sizeof (struct nfsv4node) + dnp->n_fhp->nfh_len + len - 1, M_NFSV4NODE, M_WAITOK); np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len; np->n_v4->n4_namelen = len; NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data, dnp->n_fhp->nfh_len); NFSBCOPY(name, NFS4NODENAME(np->n_v4), len); } hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len, FNV1_32_INIT); onfhp = np->n_fhp; /* * Rehash node for new file handle. */ vfs_hash_rehash(vp, hash); np->n_fhp = nfhp; if (onfhp != NULL) free(onfhp, M_NFSFH); newvp = NFSTOV(np); } else if (NFS_CMPFH(dnp, nfhp->nfh_fh, nfhp->nfh_len)) { free(nfhp, M_NFSFH); VREF(dvp); newvp = dvp; } else { cn.cn_nameptr = name; cn.cn_namelen = len; error = nfscl_nget(dvp->v_mount, dvp, nfhp, &cn, td, &np, LK_EXCLUSIVE); if (error) return (error); newvp = NFSTOV(np); /* * If n_localmodtime >= time before RPC, then * a file modification operation, such as * VOP_SETATTR() of size, has occurred while * the Lookup RPC and acquisition of the vnode * happened. As such, the attributes might * be stale, with possibly an incorrect size. */ NFSLOCKNODE(np); if (timespecisset(&np->n_localmodtime) && timespeccmp(&np->n_localmodtime, &ts, >=)) { NFSCL_DEBUG(4, "nfs_lookitup: localmod " "stale attributes\n"); attrflag = 0; } NFSUNLOCKNODE(np); } if (!attrflag && *npp == NULL) { if (newvp == dvp) vrele(newvp); else vput(newvp); return (ENOENT); } if (attrflag) (void) nfscl_loadattrcache(&newvp, &nfsva, NULL, 0, 1); } if (npp && *npp == NULL) { if (error) { if (newvp) { if (newvp == dvp) vrele(newvp); else vput(newvp); } } else *npp = np; } if (error && NFS_ISV4(dvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * Nfs Version 3 and 4 commit rpc */ int ncl_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred, struct thread *td) { struct nfsvattr nfsva; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np; struct uio uio; int error, attrflag; np = VTONFS(vp); error = EIO; attrflag = 0; if (NFSHASPNFS(nmp) && (np->n_flag & NDSCOMMIT) != 0) { uio.uio_offset = offset; uio.uio_resid = cnt; error = nfscl_doiods(vp, &uio, NULL, NULL, NFSV4OPEN_ACCESSWRITE, 1, cred, td); if (error != 0) { NFSLOCKNODE(np); np->n_flag &= ~NDSCOMMIT; NFSUNLOCKNODE(np); } } if (error != 0) { mtx_lock(&nmp->nm_mtx); if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) { mtx_unlock(&nmp->nm_mtx); return (0); } mtx_unlock(&nmp->nm_mtx); error = nfsrpc_commit(vp, offset, cnt, cred, td, &nfsva, &attrflag); } if (attrflag != 0) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error != 0 && NFS_ISV4(vp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * Strategy routine. * For async requests when nfsiod(s) are running, queue the request by * calling ncl_asyncio(), otherwise just all ncl_doio() to do the * request. */ static int nfs_strategy(struct vop_strategy_args *ap) { struct buf *bp; struct vnode *vp; struct ucred *cr; bp = ap->a_bp; vp = ap->a_vp; KASSERT(bp->b_vp == vp, ("missing b_getvp")); KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp)); if (vp->v_type == VREG && bp->b_blkno == bp->b_lblkno) bp->b_blkno = bp->b_lblkno * (vp->v_bufobj.bo_bsize / DEV_BSIZE); if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. */ if ((bp->b_flags & B_ASYNC) == 0 || ncl_asyncio(VFSTONFS(vp->v_mount), bp, NOCRED, curthread)) (void) ncl_doio(vp, bp, cr, curthread, 1); return (0); } /* * fsync vnode op. Just call ncl_flush() with commit == 1. */ /* ARGSUSED */ static int nfs_fsync(struct vop_fsync_args *ap) { if (ap->a_vp->v_type != VREG) { /* * For NFS, metadata is changed synchronously on the server, * so there is nothing to flush. Also, ncl_flush() clears * the NMODIFIED flag and that shouldn't be done here for * directories. */ return (0); } return (ncl_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1, 0)); } /* * Flush all the blocks associated with a vnode. * Walk through the buffer pool and push any dirty pages * associated with the vnode. * If the called_from_renewthread argument is TRUE, it has been called * from the NFSv4 renew thread and, as such, cannot block indefinitely * waiting for a buffer write to complete. */ int ncl_flush(struct vnode *vp, int waitfor, struct thread *td, int commit, int called_from_renewthread) { struct nfsnode *np = VTONFS(vp); struct buf *bp; int i; struct buf *nbp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos; int passone = 1, trycnt = 0; u_quad_t off, endoff, toff; struct ucred* wcred = NULL; struct buf **bvec = NULL; struct bufobj *bo; #ifndef NFS_COMMITBVECSIZ #define NFS_COMMITBVECSIZ 20 #endif struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; u_int bvecsize = 0, bveccount; struct timespec ts; if (called_from_renewthread != 0) slptimeo = hz; if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; if (!commit) passone = 0; bo = &vp->v_bufobj; /* * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the * server, but has not been committed to stable storage on the server * yet. On the first pass, the byte range is worked out and the commit * rpc is done. On the second pass, ncl_writebp() is called to do the * job. */ again: off = (u_quad_t)-1; endoff = 0; bvecpos = 0; if (NFS_ISV34(vp) && commit) { if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); /* * Count up how many buffers waiting for a commit. */ bveccount = 0; BO_LOCK(bo); TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (!BUF_ISLOCKED(bp) && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bveccount++; } /* * Allocate space to remember the list of bufs to commit. It is * important to use M_NOWAIT here to avoid a race with nfs_write. * If we can't get memory (for whatever reason), we will end up * committing the buffers one-by-one in the loop below. */ if (bveccount > NFS_COMMITBVECSIZ) { /* * Release the vnode interlock to avoid a lock * order reversal. */ BO_UNLOCK(bo); bvec = (struct buf **) malloc(bveccount * sizeof(struct buf *), M_TEMP, M_NOWAIT); BO_LOCK(bo); if (bvec == NULL) { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } else bvecsize = bveccount; } else { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bvecpos >= bvecsize) break; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { nbp = TAILQ_NEXT(bp, b_bobufs); continue; } if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_NEEDCOMMIT)) { BUF_UNLOCK(bp); nbp = TAILQ_NEXT(bp, b_bobufs); continue; } BO_UNLOCK(bo); bremfree(bp); /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. * * NOTE: we are not clearing B_DONE here, so we have * to do it later on in this routine if we intend to * initiate I/O on the bp. * * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ if (wcred == NULL) wcred = bp->b_wcred; else if (wcred != bp->b_wcred) wcred = NOCRED; vfs_busy_pages(bp, 1); BO_LOCK(bo); /* * bp is protected by being locked, but nbp is not * and vfs_busy_pages() may sleep. We have to * recalculate nbp. */ nbp = TAILQ_NEXT(bp, b_bobufs); /* * A list of these buffers is kept so that the * second loop knows which buffers have actually * been committed. This is necessary, since there * may be a race between the commit rpc and new * uncommitted writes on the file. */ bvec[bvecpos++] = bp; toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; if (toff < off) off = toff; toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); if (toff > endoff) endoff = toff; } BO_UNLOCK(bo); } if (bvecpos > 0) { /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with * one call for all of them, otherwise commit each one * separately. */ if (wcred != NOCRED) retv = ncl_commit(vp, off, (int)(endoff - off), wcred, td); else { retv = 0; for (i = 0; i < bvecpos; i++) { off_t off, size; bp = bvec[i]; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; size = (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); retv = ncl_commit(vp, off, (int)size, bp->b_wcred, td); if (retv) break; } } if (retv == NFSERR_STALEWRITEVERF) ncl_clearcommit(vp->v_mount); /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit * succeeded. */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (!NFSCL_FORCEDISM(vp->v_mount) && retv) { /* * Error, leave B_DELWRI intact */ vfs_unbusy_pages(bp); brelse(bp); } else { /* * Success, remove B_DELWRI ( bundirty() ). * * b_dirtyoff/b_dirtyend seem to be NFS * specific. We should probably move that * into bundirty(). XXX */ bufobj_wref(bo); bp->b_flags |= B_ASYNC; bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_dirtyoff = bp->b_dirtyend = 0; bufdone(bp); } } } /* * Start/do any write(s) that are required. */ loop: BO_LOCK(bo); TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { if (waitfor != MNT_WAIT || passone) continue; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "nfsfsync", slpflag, slptimeo); if (error == 0) { BUF_UNLOCK(bp); goto loop; } if (error == ENOLCK) { error = 0; goto loop; } if (called_from_renewthread != 0) { /* * Return EIO so the flush will be retried * later. */ error = EIO; goto done; } if (newnfs_sigintr(nmp, td)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } goto loop; } if ((bp->b_flags & B_DELWRI) == 0) panic("nfs_fsync: not dirty"); if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) { BUF_UNLOCK(bp); continue; } BO_UNLOCK(bo); bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); if (newnfs_sigintr(nmp, td)) { error = EINTR; goto done; } goto loop; } if (passone) { passone = 0; BO_UNLOCK(bo); goto again; } if (waitfor == MNT_WAIT) { while (bo->bo_numoutput) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); if (called_from_renewthread != 0) { /* * Return EIO so that the flush will be * retried later. */ error = EIO; goto done; } error = newnfs_sigintr(nmp, td); if (error) goto done; if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } BO_LOCK(bo); } } if (bo->bo_dirty.bv_cnt != 0 && commit) { BO_UNLOCK(bo); goto loop; } /* * Wait for all the async IO requests to drain */ BO_UNLOCK(bo); NFSLOCKNODE(np); while (np->n_directio_asyncwr > 0) { np->n_flag |= NFSYNCWAIT; error = newnfs_msleep(td, &np->n_directio_asyncwr, &np->n_mtx, slpflag | (PRIBIO + 1), "nfsfsync", 0); if (error) { if (newnfs_sigintr(nmp, td)) { NFSUNLOCKNODE(np); error = EINTR; goto done; } } } NFSUNLOCKNODE(np); } else BO_UNLOCK(bo); if (NFSHASPNFS(nmp)) { nfscl_layoutcommit(vp, td); /* * Invalidate the attribute cache, since writes to a DS * won't update the size attribute. */ NFSLOCKNODE(np); np->n_attrstamp = 0; } else NFSLOCKNODE(np); if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } if (commit && bo->bo_dirty.bv_cnt == 0 && bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0) np->n_flag &= ~NMODIFIED; NFSUNLOCKNODE(np); done: if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); if (error == 0 && commit != 0 && waitfor == MNT_WAIT && (bo->bo_dirty.bv_cnt != 0 || bo->bo_numoutput != 0 || np->n_directio_asyncwr != 0)) { if (trycnt++ < 5) { /* try, try again... */ passone = 1; wcred = NULL; bvec = NULL; bvecsize = 0; goto again; } vn_printf(vp, "ncl_flush failed"); error = called_from_renewthread != 0 ? EIO : EBUSY; } if (error == 0) { nanouptime(&ts); NFSLOCKNODE(np); np->n_localmodtime = ts; NFSUNLOCKNODE(np); } return (error); } /* * NFS advisory byte-level locks. */ static int nfs_advlock(struct vop_advlock_args *ap) { struct vnode *vp = ap->a_vp; struct ucred *cred; struct nfsnode *np = VTONFS(ap->a_vp); struct proc *p = (struct proc *)ap->a_id; struct thread *td = curthread; /* XXX */ struct vattr va; int ret, error; u_quad_t size; struct nfsmount *nmp; error = NFSVOPLOCK(vp, LK_SHARED); if (error != 0) return (EBADF); nmp = VFSTONFS(vp->v_mount); if (!NFS_ISV4(vp) || (nmp->nm_flag & NFSMNT_NOLOCKD) != 0) { if ((nmp->nm_flag & NFSMNT_NOLOCKD) != 0) { size = np->n_size; NFSVOPUNLOCK(vp); error = lf_advlock(ap, &(vp->v_lockf), size); } else { if (nfs_advlock_p != NULL) error = nfs_advlock_p(ap); else { NFSVOPUNLOCK(vp); error = ENOLCK; } } if (error == 0 && ap->a_op == F_SETLK) { error = NFSVOPLOCK(vp, LK_SHARED); if (error == 0) { /* Mark that a file lock has been acquired. */ NFSLOCKNODE(np); np->n_flag |= NHASBEENLOCKED; NFSUNLOCKNODE(np); NFSVOPUNLOCK(vp); } } return (error); } else if ((ap->a_flags & (F_POSIX | F_FLOCK)) != 0) { if (vp->v_type != VREG) { error = EINVAL; goto out; } if ((ap->a_flags & F_POSIX) != 0) cred = p->p_ucred; else cred = td->td_ucred; NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp)) { error = EBADF; goto out; } /* * If this is unlocking a write locked region, flush and * commit them before unlocking. This is required by * RFC3530 Sec. 9.3.2. */ if (ap->a_op == F_UNLCK && nfscl_checkwritelocked(vp, ap->a_fl, cred, td, ap->a_id, ap->a_flags)) (void) ncl_flush(vp, MNT_WAIT, td, 1, 0); /* * Mark NFS node as might have acquired a lock. * This is separate from NHASBEENLOCKED, because it must * be done before the nfsrpc_advlock() call, which might * add a nfscllock structure to the client state. * It is used to check for the case where a nfscllock * state structure cannot exist for the file. * Only done for "oneopenown" NFSv4.1/4.2 mounts. */ if (NFSHASNFSV4N(nmp) && NFSHASONEOPENOWN(nmp)) { NFSLOCKNODE(np); np->n_flag |= NMIGHTBELOCKED; NFSUNLOCKNODE(np); } /* * Loop around doing the lock op, while a blocking lock * must wait for the lock op to succeed. */ do { ret = nfsrpc_advlock(vp, np->n_size, ap->a_op, ap->a_fl, 0, cred, td, ap->a_id, ap->a_flags); if (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) && ap->a_op == F_SETLK) { NFSVOPUNLOCK(vp); error = nfs_catnap(PZERO | PCATCH, ret, "ncladvl"); if (error) return (EINTR); NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(vp)) { error = EBADF; goto out; } } } while (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) && ap->a_op == F_SETLK); if (ret == NFSERR_DENIED) { error = EAGAIN; goto out; } else if (ret == EINVAL || ret == EBADF || ret == EINTR) { error = ret; goto out; } else if (ret != 0) { error = EACCES; goto out; } /* * Now, if we just got a lock, invalidate data in the buffer * cache, as required, so that the coherency conforms with * RFC3530 Sec. 9.3.2. */ if (ap->a_op == F_SETLK) { if ((np->n_flag & NMODIFIED) == 0) { np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); ret = VOP_GETATTR(vp, &va, cred); } if ((np->n_flag & NMODIFIED) || ret || np->n_change != va.va_filerev) { (void) ncl_vinvalbuf(vp, V_SAVE, td, 1); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); ret = VOP_GETATTR(vp, &va, cred); if (!ret) { np->n_mtime = va.va_mtime; np->n_change = va.va_filerev; } } /* Mark that a file lock has been acquired. */ NFSLOCKNODE(np); np->n_flag |= NHASBEENLOCKED; NFSUNLOCKNODE(np); } } else error = EOPNOTSUPP; out: NFSVOPUNLOCK(vp); return (error); } /* * NFS advisory byte-level locks. */ static int nfs_advlockasync(struct vop_advlockasync_args *ap) { struct vnode *vp = ap->a_vp; u_quad_t size; int error; if (NFS_ISV4(vp)) return (EOPNOTSUPP); error = NFSVOPLOCK(vp, LK_SHARED); if (error) return (error); if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { size = VTONFS(vp)->n_size; NFSVOPUNLOCK(vp); error = lf_advlockasync(ap, &(vp->v_lockf), size); } else { NFSVOPUNLOCK(vp); error = EOPNOTSUPP; } return (error); } /* * Print out the contents of an nfsnode. */ static int nfs_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); printf("\tfileid %jd fsid 0x%jx", (uintmax_t)np->n_vattr.na_fileid, (uintmax_t)np->n_vattr.na_fsid); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * This is the "real" nfs::bwrite(struct buf*). * We set B_CACHE if this is a VMIO buffer. */ int ncl_writebp(struct buf *bp, int force __unused, struct thread *td) { int oldflags, rtval; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; bp->b_flags |= B_CACHE; /* * Undirty the bp. We will redirty it later if the I/O fails. */ bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_iocmd = BIO_WRITE; bufobj_wref(bp->b_bufobj); curthread->td_ru.ru_oublock++; /* * Note: to avoid loopback deadlocks, we do not * assign b_runningbufspace. */ vfs_busy_pages(bp, 1); BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if ((oldflags & B_ASYNC) != 0) return (0); rtval = bufwait(bp); if (oldflags & B_DELWRI) reassignbuf(bp); brelse(bp); return (rtval); } /* * nfs special file access vnode op. * Essentially just get vattr and then imitate iaccess() since the device is * local to the client. */ static int nfsspec_access(struct vop_access_args *ap) { struct vattr *vap; struct ucred *cred = ap->a_cred; struct vnode *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; struct vattr vattr; int error; /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } vap = &vattr; error = VOP_GETATTR(vp, vap, cred); if (error) goto out; error = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid, accmode, cred); out: return error; } /* * Read wrapper for fifos. */ static int nfsfifo_read(struct vop_read_args *ap) { struct nfsnode *np = VTONFS(ap->a_vp); int error; /* * Set access flag. */ NFSLOCKNODE(np); np->n_flag |= NACC; vfs_timestamp(&np->n_atim); NFSUNLOCKNODE(np); error = fifo_specops.vop_read(ap); return error; } /* * Write wrapper for fifos. */ static int nfsfifo_write(struct vop_write_args *ap) { struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. */ NFSLOCKNODE(np); np->n_flag |= NUPD; vfs_timestamp(&np->n_mtim); NFSUNLOCKNODE(np); return(fifo_specops.vop_write(ap)); } /* * Close wrapper for fifos. * * Update the times on the nfsnode then do fifo close. */ static int nfsfifo_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct vattr vattr; struct timespec ts; NFSLOCKNODE(np); if (np->n_flag & (NACC | NUPD)) { vfs_timestamp(&ts); if (np->n_flag & NACC) np->n_atim = ts; if (np->n_flag & NUPD) np->n_mtim = ts; np->n_flag |= NCHG; if (vrefcnt(vp) == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; NFSUNLOCKNODE(np); (void)VOP_SETATTR(vp, &vattr, ap->a_cred); goto out; } } NFSUNLOCKNODE(np); out: return (fifo_specops.vop_close(ap)); } /* * Just call ncl_writebp() with the force argument set to 1. * * NOTE: B_DONE may or may not be set in a_bp on call. */ static int nfs_bwrite(struct buf *bp) { return (ncl_writebp(bp, 1, curthread)); } struct buf_ops buf_ops_newnfs = { .bop_name = "buf_ops_nfs", .bop_write = nfs_bwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; static int nfs_getacl(struct vop_getacl_args *ap) { int error; if (ap->a_type != ACL_TYPE_NFS4) return (EOPNOTSUPP); error = nfsrpc_getacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp); if (error > NFSERR_STALE) { (void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0); error = EPERM; } return (error); } static int nfs_setacl(struct vop_setacl_args *ap) { int error; if (ap->a_type != ACL_TYPE_NFS4) return (EOPNOTSUPP); error = nfsrpc_setacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp); if (error > NFSERR_STALE) { (void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0); error = EPERM; } return (error); } /* * VOP_ADVISE for NFS. * Just return 0 for any errors, since it is just a hint. */ static int nfs_advise(struct vop_advise_args *ap) { struct thread *td = curthread; struct nfsmount *nmp; uint64_t len; int error; /* * First do vop_stdadvise() to handle the buffer cache. */ error = vop_stdadvise(ap); if (error != 0) return (error); if (ap->a_start < 0 || ap->a_end < 0) return (0); if (ap->a_end == OFF_MAX) len = 0; else if (ap->a_end < ap->a_start) return (0); else len = ap->a_end - ap->a_start + 1; nmp = VFSTONFS(ap->a_vp->v_mount); mtx_lock(&nmp->nm_mtx); if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION || (NFSHASPNFS(nmp) && (nmp->nm_privflag & NFSMNTP_IOADVISETHRUMDS) == 0) || (nmp->nm_privflag & NFSMNTP_NOADVISE) != 0) { mtx_unlock(&nmp->nm_mtx); return (0); } mtx_unlock(&nmp->nm_mtx); error = nfsrpc_advise(ap->a_vp, ap->a_start, len, ap->a_advice, td->td_ucred, td); if (error == NFSERR_NOTSUPP) { mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NOADVISE; mtx_unlock(&nmp->nm_mtx); } return (0); } /* * nfs allocate call */ static int nfs_allocate(struct vop_allocate_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = curthread; struct nfsvattr nfsva; struct nfsmount *nmp; struct nfsnode *np; off_t alen; int attrflag, error, ret; struct timespec ts; struct uio io; attrflag = 0; nmp = VFSTONFS(vp->v_mount); np = VTONFS(vp); mtx_lock(&nmp->nm_mtx); if (NFSHASNFSV4(nmp) && nmp->nm_minorvers >= NFSV42_MINORVERSION && (nmp->nm_privflag & NFSMNTP_NOALLOCATE) == 0) { mtx_unlock(&nmp->nm_mtx); alen = *ap->a_len; if ((uint64_t)alen > nfs_maxalloclen) alen = nfs_maxalloclen; /* Check the file size limit. */ io.uio_offset = *ap->a_offset; io.uio_resid = alen; error = vn_rlimit_fsize(vp, &io, td); /* * Flush first to ensure that the allocate adds to the * file's allocation on the server. */ if (error == 0) error = ncl_flush(vp, MNT_WAIT, td, 1, 0); if (error == 0) error = nfsrpc_allocate(vp, *ap->a_offset, alen, &nfsva, &attrflag, ap->a_cred, td); if (error == 0) { *ap->a_offset += alen; *ap->a_len -= alen; nanouptime(&ts); NFSLOCKNODE(np); np->n_localmodtime = ts; NFSUNLOCKNODE(np); } else if (error == NFSERR_NOTSUPP) { mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NOALLOCATE; mtx_unlock(&nmp->nm_mtx); error = EINVAL; } } else { mtx_unlock(&nmp->nm_mtx); error = EINVAL; } if (attrflag != 0) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error == 0 && ret != 0) error = ret; } if (error != 0) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs deallocate call */ static int nfs_deallocate(struct vop_deallocate_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = curthread; struct nfsvattr nfsva; struct nfsmount *nmp; struct nfsnode *np; off_t tlen, mlen; int attrflag, error, ret; bool clipped; struct timespec ts; error = 0; attrflag = 0; nmp = VFSTONFS(vp->v_mount); np = VTONFS(vp); mtx_lock(&nmp->nm_mtx); if (NFSHASNFSV4(nmp) && nmp->nm_minorvers >= NFSV42_MINORVERSION && (nmp->nm_privflag & NFSMNTP_NODEALLOCATE) == 0) { mtx_unlock(&nmp->nm_mtx); tlen = omin(OFF_MAX - *ap->a_offset, *ap->a_len); NFSCL_DEBUG(4, "dealloc: off=%jd len=%jd maxfilesize=%ju\n", (intmax_t)*ap->a_offset, (intmax_t)tlen, (uintmax_t)nmp->nm_maxfilesize); if ((uint64_t)*ap->a_offset >= nmp->nm_maxfilesize) { /* Avoid EFBIG error return from the NFSv4.2 server. */ *ap->a_len = 0; return (0); } clipped = false; if ((uint64_t)*ap->a_offset + tlen > nmp->nm_maxfilesize) tlen = nmp->nm_maxfilesize - *ap->a_offset; if ((uint64_t)*ap->a_offset < np->n_size) { /* Limit the len to nfs_maxalloclen before EOF. */ mlen = omin((off_t)np->n_size - *ap->a_offset, tlen); if ((uint64_t)mlen > nfs_maxalloclen) { NFSCL_DEBUG(4, "dealloc: tlen maxalloclen\n"); tlen = nfs_maxalloclen; clipped = true; } } if (error == 0) error = ncl_vinvalbuf(vp, V_SAVE, td, 1); if (error == 0) { vnode_pager_purge_range(vp, *ap->a_offset, *ap->a_offset + tlen); error = nfsrpc_deallocate(vp, *ap->a_offset, tlen, &nfsva, &attrflag, ap->a_cred, td); NFSCL_DEBUG(4, "dealloc: rpc=%d\n", error); } if (error == 0) { NFSCL_DEBUG(4, "dealloc: attrflag=%d na_size=%ju\n", attrflag, (uintmax_t)nfsva.na_size); nanouptime(&ts); NFSLOCKNODE(np); np->n_localmodtime = ts; NFSUNLOCKNODE(np); if (attrflag != 0) { if ((uint64_t)*ap->a_offset < nfsva.na_size) *ap->a_offset += omin((off_t) nfsva.na_size - *ap->a_offset, tlen); } if (clipped && tlen < *ap->a_len) *ap->a_len -= tlen; else *ap->a_len = 0; } else if (error == NFSERR_NOTSUPP) { mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NODEALLOCATE; mtx_unlock(&nmp->nm_mtx); } } else { mtx_unlock(&nmp->nm_mtx); error = EIO; } /* * If the NFS server cannot perform the Deallocate operation, just call * vop_stddeallocate() to perform it. */ if (error != 0 && error != NFSERR_FBIG && error != NFSERR_INVAL) { error = vop_stddeallocate(ap); NFSCL_DEBUG(4, "dealloc: stddeallocate=%d\n", error); } if (attrflag != 0) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error == 0 && ret != 0) error = ret; } if (error != 0) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs copy_file_range call */ static int nfs_copy_file_range(struct vop_copy_file_range_args *ap) { struct vnode *invp = ap->a_invp; struct vnode *outvp = ap->a_outvp; struct mount *mp; struct nfsvattr innfsva, outnfsva; struct vattr *vap; struct uio io; struct nfsmount *nmp; size_t len, len2; int error, inattrflag, outattrflag, ret, ret2; off_t inoff, outoff; bool consecutive, must_commit, tryoutcred; ret = ret2 = 0; nmp = VFSTONFS(invp->v_mount); mtx_lock(&nmp->nm_mtx); /* NFSv4.2 Copy is not permitted for infile == outfile. */ if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION || (nmp->nm_privflag & NFSMNTP_NOCOPY) != 0 || invp == outvp) { mtx_unlock(&nmp->nm_mtx); error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp, ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred, ap->a_outcred, ap->a_fsizetd); return (error); } mtx_unlock(&nmp->nm_mtx); /* Lock both vnodes, avoiding risk of deadlock. */ do { mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error == 0) { error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { error = vn_lock(invp, LK_SHARED | LK_NOWAIT); if (error == 0) break; VOP_UNLOCK(outvp); if (mp != NULL) vn_finished_write(mp); mp = NULL; error = vn_lock(invp, LK_SHARED); if (error == 0) VOP_UNLOCK(invp); } } if (mp != NULL) vn_finished_write(mp); } while (error == 0); if (error != 0) return (error); /* * Do the vn_rlimit_fsize() check. Should this be above the VOP layer? */ io.uio_offset = *ap->a_outoffp; io.uio_resid = *ap->a_lenp; error = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd); /* * Flush the input file so that the data is up to date before * the copy. Flush writes for the output file so that they * do not overwrite the data copied to the output file by the Copy. * Set the commit argument for both flushes so that the data is on * stable storage before the Copy RPC. This is done in case the * server reboots during the Copy and needs to be redone. */ if (error == 0) error = ncl_flush(invp, MNT_WAIT, curthread, 1, 0); if (error == 0) error = ncl_flush(outvp, MNT_WAIT, curthread, 1, 0); /* Do the actual NFSv4.2 RPC. */ len = *ap->a_lenp; mtx_lock(&nmp->nm_mtx); if ((nmp->nm_privflag & NFSMNTP_NOCONSECUTIVE) == 0) consecutive = true; else consecutive = false; mtx_unlock(&nmp->nm_mtx); inoff = *ap->a_inoffp; outoff = *ap->a_outoffp; tryoutcred = true; must_commit = false; if (error == 0) { vap = &VTONFS(invp)->n_vattr.na_vattr; error = VOP_GETATTR(invp, vap, ap->a_incred); if (error == 0) { /* * Clip "len" at va_size so that RFC compliant servers * will not reply NFSERR_INVAL. * Setting "len == 0" for the RPC would be preferred, * but some Linux servers do not support that. */ if (inoff >= vap->va_size) *ap->a_lenp = len = 0; else if (inoff + len > vap->va_size) *ap->a_lenp = len = vap->va_size - inoff; } else error = 0; } /* * len will be set to 0 upon a successful Copy RPC. * As such, this only loops when the Copy RPC needs to be retried. */ while (len > 0 && error == 0) { inattrflag = outattrflag = 0; len2 = len; if (tryoutcred) error = nfsrpc_copy_file_range(invp, ap->a_inoffp, outvp, ap->a_outoffp, &len2, ap->a_flags, &inattrflag, &innfsva, &outattrflag, &outnfsva, ap->a_outcred, consecutive, &must_commit); else error = nfsrpc_copy_file_range(invp, ap->a_inoffp, outvp, ap->a_outoffp, &len2, ap->a_flags, &inattrflag, &innfsva, &outattrflag, &outnfsva, ap->a_incred, consecutive, &must_commit); if (inattrflag != 0) ret = nfscl_loadattrcache(&invp, &innfsva, NULL, 0, 1); if (outattrflag != 0) ret2 = nfscl_loadattrcache(&outvp, &outnfsva, NULL, 1, 1); if (error == 0) { if (consecutive == false) { if (len2 == len) { mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NOCONSECUTIVE; mtx_unlock(&nmp->nm_mtx); } else error = NFSERR_OFFLOADNOREQS; } *ap->a_lenp = len2; len = 0; if (len2 > 0 && must_commit && error == 0) error = ncl_commit(outvp, outoff, *ap->a_lenp, ap->a_outcred, curthread); if (error == 0 && ret != 0) error = ret; if (error == 0 && ret2 != 0) error = ret2; } else if (error == NFSERR_OFFLOADNOREQS && consecutive) { /* * Try consecutive == false, which is ok only if all * bytes are copied. * If only some bytes were copied when consecutive * is false, there is no way to know which bytes * still need to be written. */ consecutive = false; error = 0; } else if (error == NFSERR_ACCES && tryoutcred) { /* Try again with incred. */ tryoutcred = false; error = 0; } if (error == NFSERR_STALEWRITEVERF) { /* * Server rebooted, so do it all again. */ *ap->a_inoffp = inoff; *ap->a_outoffp = outoff; len = *ap->a_lenp; must_commit = false; error = 0; } } VOP_UNLOCK(invp); VOP_UNLOCK(outvp); if (mp != NULL) vn_finished_write(mp); if (error == NFSERR_NOTSUPP || error == NFSERR_OFFLOADNOREQS || error == NFSERR_ACCES) { /* * Unlike the NFSv4.2 Copy, vn_generic_copy_file_range() can * use a_incred for the read and a_outcred for the write, so * try this for NFSERR_ACCES failures for the Copy. * For NFSERR_NOTSUPP and NFSERR_OFFLOADNOREQS, the Copy can * never succeed, so disable it. */ if (error != NFSERR_ACCES) { /* Can never do Copy on this mount. */ mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NOCOPY; mtx_unlock(&nmp->nm_mtx); } *ap->a_inoffp = inoff; *ap->a_outoffp = outoff; error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp, ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred, ap->a_outcred, ap->a_fsizetd); } else if (error != 0) *ap->a_lenp = 0; if (error != 0) error = nfscl_maperr(curthread, error, (uid_t)0, (gid_t)0); return (error); } /* * nfs ioctl call */ static int nfs_ioctl(struct vop_ioctl_args *ap) { struct vnode *vp = ap->a_vp; struct nfsvattr nfsva; struct nfsmount *nmp; int attrflag, content, error, ret; bool eof = false; /* shut up compiler. */ if (vp->v_type != VREG) return (ENOTTY); nmp = VFSTONFS(vp->v_mount); if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION) { error = vop_stdioctl(ap); return (error); } /* Do the actual NFSv4.2 RPC. */ switch (ap->a_command) { case FIOSEEKDATA: content = NFSV4CONTENT_DATA; break; case FIOSEEKHOLE: content = NFSV4CONTENT_HOLE; break; default: return (ENOTTY); } error = vn_lock(vp, LK_SHARED); if (error != 0) return (EBADF); attrflag = 0; if (*((off_t *)ap->a_data) >= VTONFS(vp)->n_size) error = ENXIO; else { /* * Flush all writes, so that the server is up to date. * Although a Commit is not required, the commit argument * is set so that, for a pNFS File/Flexible File Layout * server, the LayoutCommit will be done to ensure the file * size is up to date on the Metadata Server. */ error = ncl_flush(vp, MNT_WAIT, ap->a_td, 1, 0); if (error == 0) error = nfsrpc_seek(vp, (off_t *)ap->a_data, &eof, content, ap->a_cred, &nfsva, &attrflag); /* If at eof for FIOSEEKDATA, return ENXIO. */ if (eof && error == 0 && content == NFSV4CONTENT_DATA) error = ENXIO; } if (attrflag != 0) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error == 0 && ret != 0) error = ret; } NFSVOPUNLOCK(vp); if (error != 0) error = ENXIO; return (error); } /* * nfs getextattr call */ static int nfs_getextattr(struct vop_getextattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsmount *nmp; struct ucred *cred; struct thread *td = ap->a_td; struct nfsvattr nfsva; ssize_t len; int attrflag, error, ret; nmp = VFSTONFS(vp->v_mount); mtx_lock(&nmp->nm_mtx); if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION || (nmp->nm_privflag & NFSMNTP_NOXATTR) != 0 || ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) { mtx_unlock(&nmp->nm_mtx); return (EOPNOTSUPP); } mtx_unlock(&nmp->nm_mtx); cred = ap->a_cred; if (cred == NULL) cred = td->td_ucred; /* Do the actual NFSv4.2 Optional Extended Attribute (RFC-8276) RPC. */ attrflag = 0; error = nfsrpc_getextattr(vp, ap->a_name, ap->a_uio, &len, &nfsva, &attrflag, cred, td); if (attrflag != 0) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error == 0 && ret != 0) error = ret; } if (error == 0 && ap->a_size != NULL) *ap->a_size = len; switch (error) { case NFSERR_NOTSUPP: case NFSERR_OPILLEGAL: mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NOXATTR; mtx_unlock(&nmp->nm_mtx); error = EOPNOTSUPP; break; case NFSERR_NOXATTR: case NFSERR_XATTR2BIG: error = ENOATTR; break; default: error = nfscl_maperr(td, error, 0, 0); break; } return (error); } /* * nfs setextattr call */ static int nfs_setextattr(struct vop_setextattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsmount *nmp; struct ucred *cred; struct thread *td = ap->a_td; struct nfsvattr nfsva; int attrflag, error, ret; nmp = VFSTONFS(vp->v_mount); mtx_lock(&nmp->nm_mtx); if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION || (nmp->nm_privflag & NFSMNTP_NOXATTR) != 0 || ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) { mtx_unlock(&nmp->nm_mtx); return (EOPNOTSUPP); } mtx_unlock(&nmp->nm_mtx); if (ap->a_uio->uio_resid < 0) return (EINVAL); cred = ap->a_cred; if (cred == NULL) cred = td->td_ucred; /* Do the actual NFSv4.2 Optional Extended Attribute (RFC-8276) RPC. */ attrflag = 0; error = nfsrpc_setextattr(vp, ap->a_name, ap->a_uio, &nfsva, &attrflag, cred, td); if (attrflag != 0) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error == 0 && ret != 0) error = ret; } switch (error) { case NFSERR_NOTSUPP: case NFSERR_OPILLEGAL: mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NOXATTR; mtx_unlock(&nmp->nm_mtx); error = EOPNOTSUPP; break; case NFSERR_NOXATTR: case NFSERR_XATTR2BIG: error = ENOATTR; break; default: error = nfscl_maperr(td, error, 0, 0); break; } return (error); } /* * nfs listextattr call */ static int nfs_listextattr(struct vop_listextattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsmount *nmp; struct ucred *cred; struct thread *td = ap->a_td; struct nfsvattr nfsva; size_t len, len2; uint64_t cookie; int attrflag, error, ret; bool eof; nmp = VFSTONFS(vp->v_mount); mtx_lock(&nmp->nm_mtx); if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION || (nmp->nm_privflag & NFSMNTP_NOXATTR) != 0 || ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) { mtx_unlock(&nmp->nm_mtx); return (EOPNOTSUPP); } mtx_unlock(&nmp->nm_mtx); cred = ap->a_cred; if (cred == NULL) cred = td->td_ucred; /* Loop around doing List Extended Attribute RPCs. */ eof = false; cookie = 0; len2 = 0; error = 0; while (!eof && error == 0) { len = nmp->nm_rsize; attrflag = 0; error = nfsrpc_listextattr(vp, &cookie, ap->a_uio, &len, &eof, &nfsva, &attrflag, cred, td); if (attrflag != 0) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error == 0 && ret != 0) error = ret; } if (error == 0) { len2 += len; if (len2 > SSIZE_MAX) error = ENOATTR; } } if (error == 0 && ap->a_size != NULL) *ap->a_size = len2; switch (error) { case NFSERR_NOTSUPP: case NFSERR_OPILLEGAL: mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NOXATTR; mtx_unlock(&nmp->nm_mtx); error = EOPNOTSUPP; break; case NFSERR_NOXATTR: case NFSERR_XATTR2BIG: error = ENOATTR; break; default: error = nfscl_maperr(td, error, 0, 0); break; } return (error); } /* * nfs setextattr call */ static int nfs_deleteextattr(struct vop_deleteextattr_args *ap) { struct vnode *vp = ap->a_vp; struct nfsmount *nmp; struct nfsvattr nfsva; int attrflag, error, ret; nmp = VFSTONFS(vp->v_mount); mtx_lock(&nmp->nm_mtx); if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION || (nmp->nm_privflag & NFSMNTP_NOXATTR) != 0 || ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) { mtx_unlock(&nmp->nm_mtx); return (EOPNOTSUPP); } mtx_unlock(&nmp->nm_mtx); /* Do the actual NFSv4.2 Optional Extended Attribute (RFC-8276) RPC. */ attrflag = 0; error = nfsrpc_rmextattr(vp, ap->a_name, &nfsva, &attrflag, ap->a_cred, ap->a_td); if (attrflag != 0) { ret = nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error == 0 && ret != 0) error = ret; } switch (error) { case NFSERR_NOTSUPP: case NFSERR_OPILLEGAL: mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_NOXATTR; mtx_unlock(&nmp->nm_mtx); error = EOPNOTSUPP; break; case NFSERR_NOXATTR: case NFSERR_XATTR2BIG: error = ENOATTR; break; default: error = nfscl_maperr(ap->a_td, error, 0, 0); break; } return (error); } /* * Return POSIX pathconf information applicable to nfs filesystems. */ static int nfs_pathconf(struct vop_pathconf_args *ap) { struct nfsv3_pathconf pc; struct nfsvattr nfsva; struct vnode *vp = ap->a_vp; struct nfsmount *nmp; struct thread *td = curthread; off_t off; bool eof; int attrflag, error; if ((NFS_ISV34(vp) && (ap->a_name == _PC_LINK_MAX || ap->a_name == _PC_NAME_MAX || ap->a_name == _PC_CHOWN_RESTRICTED || ap->a_name == _PC_NO_TRUNC)) || (NFS_ISV4(vp) && ap->a_name == _PC_ACL_NFS4)) { /* * Since only the above 4 a_names are returned by the NFSv3 * Pathconf RPC, there is no point in doing it for others. * For NFSv4, the Pathconf RPC (actually a Getattr Op.) can * be used for _PC_NFS4_ACL as well. */ error = nfsrpc_pathconf(vp, &pc, td->td_ucred, td, &nfsva, &attrflag); if (attrflag != 0) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); if (error != 0) return (error); } else { /* * For NFSv2 (or NFSv3 when not one of the above 4 a_names), * just fake them. */ pc.pc_linkmax = NFS_LINK_MAX; pc.pc_namemax = NFS_MAXNAMLEN; pc.pc_notrunc = 1; pc.pc_chownrestricted = 1; pc.pc_caseinsensitive = 0; pc.pc_casepreserving = 1; error = 0; } switch (ap->a_name) { case _PC_LINK_MAX: #ifdef _LP64 *ap->a_retval = pc.pc_linkmax; #else *ap->a_retval = MIN(LONG_MAX, pc.pc_linkmax); #endif break; case _PC_NAME_MAX: *ap->a_retval = pc.pc_namemax; break; case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) *ap->a_retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = pc.pc_chownrestricted; break; case _PC_NO_TRUNC: *ap->a_retval = pc.pc_notrunc; break; case _PC_ACL_NFS4: if (NFS_ISV4(vp) && nfsrv_useacl != 0 && attrflag != 0 && NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL)) *ap->a_retval = 1; else *ap->a_retval = 0; break; case _PC_ACL_PATH_MAX: if (NFS_ISV4(vp)) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: if (NFS_ISV34(vp)) *ap->a_retval = 64; else *ap->a_retval = 32; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = NFS_MAXPATHLEN; break; case _PC_MIN_HOLE_SIZE: /* Only some NFSv4.2 servers support Seek for Holes. */ *ap->a_retval = 0; nmp = VFSTONFS(vp->v_mount); if (NFS_ISV4(vp) && nmp->nm_minorvers == NFSV42_MINORVERSION) { /* * NFSv4.2 doesn't have an attribute for hole size, * so all we can do is see if the Seek operation is * supported and then use f_iosize as a "best guess". */ mtx_lock(&nmp->nm_mtx); if ((nmp->nm_privflag & NFSMNTP_SEEKTESTED) == 0) { mtx_unlock(&nmp->nm_mtx); off = 0; attrflag = 0; error = nfsrpc_seek(vp, &off, &eof, NFSV4CONTENT_HOLE, td->td_ucred, &nfsva, &attrflag); if (attrflag != 0) nfscl_loadattrcache(&vp, &nfsva, NULL, 0, 1); mtx_lock(&nmp->nm_mtx); if (error == NFSERR_NOTSUPP) nmp->nm_privflag |= NFSMNTP_SEEKTESTED; else nmp->nm_privflag |= NFSMNTP_SEEKTESTED | NFSMNTP_SEEK; error = 0; } if ((nmp->nm_privflag & NFSMNTP_SEEK) != 0) *ap->a_retval = vp->v_mount->mnt_stat.f_iosize; mtx_unlock(&nmp->nm_mtx); } break; default: error = vop_stdpathconf(ap); break; } return (error); } diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c index 9fc475ac7ecb..8e15237bc10c 100644 --- a/sys/fs/nfsserver/nfs_nfsdport.c +++ b/sys/fs/nfsserver/nfs_nfsdport.c @@ -1,7127 +1,7120 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include /* * Functions that perform the vfs operations required by the routines in * nfsd_serv.c. It is hoped that this change will make the server more * portable. */ #include #include #include #include #include #include #include #include FEATURE(nfsd, "NFSv4 server"); extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1; extern int nfsrv_useacl; extern int newnfs_numnfsd; extern struct mount nfsv4root_mnt; extern struct nfsrv_stablefirst nfsrv_stablefirst; extern SVCPOOL *nfsrvd_pool; extern struct nfsv4lock nfsd_suspend_lock; extern struct nfsclienthashhead *nfsclienthash; extern struct nfslockhashhead *nfslockhash; extern struct nfssessionhash *nfssessionhash; extern int nfsrv_sessionhashsize; extern struct nfsstatsv1 nfsstatsv1; extern struct nfslayouthash *nfslayouthash; extern int nfsrv_layouthashsize; extern struct mtx nfsrv_dslock_mtx; extern int nfs_pnfsiothreads; extern struct nfsdontlisthead nfsrv_dontlisthead; extern volatile int nfsrv_dontlistlen; extern volatile int nfsrv_devidcnt; extern int nfsrv_maxpnfsmirror; extern uint32_t nfs_srvmaxio; extern int nfs_bufpackets; extern u_long sb_max_adj; struct vfsoptlist nfsv4root_opt, nfsv4root_newopt; NFSDLOCKMUTEX; NFSSTATESPINLOCK; struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE]; struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE]; struct mtx nfsrc_udpmtx; struct mtx nfs_v4root_mutex; struct mtx nfsrv_dontlistlock_mtx; struct mtx nfsrv_recalllock_mtx; struct nfsrvfh nfs_rootfh, nfs_pubfh; int nfs_pubfhset = 0, nfs_rootfhset = 0; struct proc *nfsd_master_proc = NULL; int nfsd_debuglevel = 0; static pid_t nfsd_master_pid = (pid_t)-1; static char nfsd_master_comm[MAXCOMLEN + 1]; static struct timeval nfsd_master_start; static uint32_t nfsv4_sysid = 0; static fhandle_t zerofh; struct callout nfsd_callout; static int nfssvc_srvcall(struct thread *, struct nfssvc_args *, struct ucred *); int nfsrv_enable_crossmntpt = 1; static int nfs_commit_blks; static int nfs_commit_miss; extern int nfsrv_issuedelegs; extern int nfsrv_dolocallocks; extern int nfsd_enable_stringtouid; extern struct nfsdevicehead nfsrv_devidhead; static int nfsrv_createiovec(int, struct mbuf **, struct mbuf **, struct iovec **); static int nfsrv_createiovec_extpgs(int, int, struct mbuf **, struct mbuf **, struct iovec **); static int nfsrv_createiovecw(int, struct mbuf *, char *, struct iovec **, int *); static void nfsrv_pnfscreate(struct vnode *, struct vattr *, struct ucred *, NFSPROC_T *); static void nfsrv_pnfsremovesetup(struct vnode *, NFSPROC_T *, struct vnode **, int *, char *, fhandle_t *); static void nfsrv_pnfsremove(struct vnode **, int, char *, fhandle_t *, NFSPROC_T *); static int nfsrv_proxyds(struct vnode *, off_t, int, struct ucred *, struct thread *, int, struct mbuf **, char *, struct mbuf **, struct nfsvattr *, struct acl *, off_t *, int, bool *); static int nfsrv_setextattr(struct vnode *, struct nfsvattr *, NFSPROC_T *); static int nfsrv_readdsrpc(fhandle_t *, off_t, int, struct ucred *, NFSPROC_T *, struct nfsmount *, struct mbuf **, struct mbuf **); static int nfsrv_writedsrpc(fhandle_t *, off_t, int, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct mbuf **, char *, int *); static int nfsrv_allocatedsrpc(fhandle_t *, off_t, off_t, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, int *); static int nfsrv_deallocatedsrpc(fhandle_t *, off_t, off_t, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, int *); static int nfsrv_setacldsrpc(fhandle_t *, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct acl *, int *); static int nfsrv_setattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct nfsvattr *, int *); static int nfsrv_getattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount *, struct nfsvattr *); static int nfsrv_seekdsrpc(fhandle_t *, off_t *, int, bool *, struct ucred *, NFSPROC_T *, struct nfsmount *); static int nfsrv_putfhname(fhandle_t *, char *); static int nfsrv_pnfslookupds(struct vnode *, struct vnode *, struct pnfsdsfile *, struct vnode **, NFSPROC_T *); static void nfsrv_pnfssetfh(struct vnode *, struct pnfsdsfile *, char *, char *, struct vnode *, NFSPROC_T *); static int nfsrv_dsremove(struct vnode *, char *, struct ucred *, NFSPROC_T *); static int nfsrv_dssetacl(struct vnode *, struct acl *, struct ucred *, NFSPROC_T *); static int nfsrv_pnfsstatfs(struct statfs *, struct mount *); int nfs_pnfsio(task_fn_t *, void *); SYSCTL_NODE(_vfs, OID_AUTO, nfsd, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "NFS server"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, mirrormnt, CTLFLAG_RW, &nfsrv_enable_crossmntpt, 0, "Enable nfsd to cross mount points"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks, 0, ""); SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss, 0, ""); SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW, &nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW, &nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, debuglevel, CTLFLAG_RW, &nfsd_debuglevel, 0, "Debug level for NFS server"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_stringtouid, CTLFLAG_RW, &nfsd_enable_stringtouid, 0, "Enable nfsd to accept numeric owner_names"); static int nfsrv_pnfsgetdsattr = 1; SYSCTL_INT(_vfs_nfsd, OID_AUTO, pnfsgetdsattr, CTLFLAG_RW, &nfsrv_pnfsgetdsattr, 0, "When set getattr gets DS attributes via RPC"); /* * nfsrv_dsdirsize can only be increased and only when the nfsd threads are * not running. * The dsN subdirectories for the increased values must have been created * on all DS servers before this increase is done. */ u_int nfsrv_dsdirsize = 20; static int sysctl_dsdirsize(SYSCTL_HANDLER_ARGS) { int error, newdsdirsize; newdsdirsize = nfsrv_dsdirsize; error = sysctl_handle_int(oidp, &newdsdirsize, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (newdsdirsize <= nfsrv_dsdirsize || newdsdirsize > 10000 || newnfs_numnfsd != 0) return (EINVAL); nfsrv_dsdirsize = newdsdirsize; return (0); } SYSCTL_PROC(_vfs_nfsd, OID_AUTO, dsdirsize, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrv_dsdirsize), sysctl_dsdirsize, "IU", "Number of dsN subdirs on the DS servers"); /* * nfs_srvmaxio can only be increased and only when the nfsd threads are * not running. The setting must be a power of 2, with the current limit of * 1Mbyte. */ static int sysctl_srvmaxio(SYSCTL_HANDLER_ARGS) { int error; u_int newsrvmaxio; uint64_t tval; newsrvmaxio = nfs_srvmaxio; error = sysctl_handle_int(oidp, &newsrvmaxio, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (newsrvmaxio == nfs_srvmaxio) return (0); if (newsrvmaxio < nfs_srvmaxio) { printf("nfsd: vfs.nfsd.srvmaxio can only be increased\n"); return (EINVAL); } if (newsrvmaxio > 1048576) { printf("nfsd: vfs.nfsd.srvmaxio cannot be > 1Mbyte\n"); return (EINVAL); } if ((newsrvmaxio & (newsrvmaxio - 1)) != 0) { printf("nfsd: vfs.nfsd.srvmaxio must be a power of 2\n"); return (EINVAL); } /* * Check that kern.ipc.maxsockbuf is large enough for * newsrviomax, given the setting of vfs.nfs.bufpackets. */ if ((newsrvmaxio + NFS_MAXXDR) * nfs_bufpackets > sb_max_adj) { /* * Suggest vfs.nfs.bufpackets * maximum RPC message for * sb_max_adj. */ tval = (newsrvmaxio + NFS_MAXXDR) * nfs_bufpackets; /* * Convert suggested sb_max_adj value to a suggested * sb_max value, which is what is set via kern.ipc.maxsockbuf. * Perform the inverse calculation of (from uipc_sockbuf.c): * sb_max_adj = (u_quad_t)sb_max * MCLBYTES / * (MSIZE + MCLBYTES); * XXX If the calculation of sb_max_adj from sb_max changes, * this calculation must be changed as well. */ tval *= (MSIZE + MCLBYTES); /* Brackets for readability. */ tval += MCLBYTES - 1; /* Round up divide. */ tval /= MCLBYTES; printf("nfsd: set kern.ipc.maxsockbuf to a minimum of " "%ju to support %ubyte NFS I/O\n", (uintmax_t)tval, newsrvmaxio); return (EINVAL); } NFSD_LOCK(); if (newnfs_numnfsd != 0) { NFSD_UNLOCK(); printf("nfsd: cannot set vfs.nfsd.srvmaxio when nfsd " "threads are running\n"); return (EINVAL); } nfs_srvmaxio = newsrvmaxio; NFSD_UNLOCK(); return (0); } SYSCTL_PROC(_vfs_nfsd, OID_AUTO, srvmaxio, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_srvmaxio, "IU", "Maximum I/O size in bytes"); #define MAX_REORDERED_RPC 16 #define NUM_HEURISTIC 1031 #define NHUSE_INIT 64 #define NHUSE_INC 16 #define NHUSE_MAX 2048 static struct nfsheur { struct vnode *nh_vp; /* vp to match (unreferenced pointer) */ off_t nh_nextoff; /* next offset for sequential detection */ int nh_use; /* use count for selection */ int nh_seqcount; /* heuristic */ } nfsheur[NUM_HEURISTIC]; /* * Heuristic to detect sequential operation. */ static struct nfsheur * nfsrv_sequential_heuristic(struct uio *uio, struct vnode *vp) { struct nfsheur *nh; int hi, try; /* Locate best candidate. */ try = 32; hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC; nh = &nfsheur[hi]; while (try--) { if (nfsheur[hi].nh_vp == vp) { nh = &nfsheur[hi]; break; } if (nfsheur[hi].nh_use > 0) --nfsheur[hi].nh_use; hi = (hi + 1) % NUM_HEURISTIC; if (nfsheur[hi].nh_use < nh->nh_use) nh = &nfsheur[hi]; } /* Initialize hint if this is a new file. */ if (nh->nh_vp != vp) { nh->nh_vp = vp; nh->nh_nextoff = uio->uio_offset; nh->nh_use = NHUSE_INIT; if (uio->uio_offset == 0) nh->nh_seqcount = 4; else nh->nh_seqcount = 1; } /* Calculate heuristic. */ if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) || uio->uio_offset == nh->nh_nextoff) { /* See comments in vfs_vnops.c:sequential_heuristic(). */ nh->nh_seqcount += howmany(uio->uio_resid, 16384); if (nh->nh_seqcount > IO_SEQMAX) nh->nh_seqcount = IO_SEQMAX; } else if (qabs(uio->uio_offset - nh->nh_nextoff) <= MAX_REORDERED_RPC * imax(vp->v_mount->mnt_stat.f_iosize, uio->uio_resid)) { /* Probably a reordered RPC, leave seqcount alone. */ } else if (nh->nh_seqcount > 1) { nh->nh_seqcount /= 2; } else { nh->nh_seqcount = 0; } nh->nh_use += NHUSE_INC; if (nh->nh_use > NHUSE_MAX) nh->nh_use = NHUSE_MAX; return (nh); } /* * Get attributes into nfsvattr structure. */ int nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap, struct nfsrv_descript *nd, struct thread *p, int vpislocked, nfsattrbit_t *attrbitp) { int error, gotattr, lockedit = 0; struct nfsvattr na; if (vpislocked == 0) { /* * When vpislocked == 0, the vnode is either exclusively * locked by this thread or not locked by this thread. * As such, shared lock it, if not exclusively locked. */ if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) { lockedit = 1; NFSVOPLOCK(vp, LK_SHARED | LK_RETRY); } } /* * Acquire the Change, Size, TimeAccess, TimeModify and SpaceUsed * attributes, as required. * This needs to be done for regular files if: * - non-NFSv4 RPCs or * - when attrbitp == NULL or * - an NFSv4 RPC with any of the above attributes in attrbitp. * A return of 0 for nfsrv_proxyds() indicates that it has acquired * these attributes. nfsrv_proxyds() will return an error if the * server is not a pNFS one. */ gotattr = 0; if (vp->v_type == VREG && nfsrv_devidcnt > 0 && (attrbitp == NULL || (nd->nd_flag & ND_NFSV4) == 0 || NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_CHANGE) || NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SIZE) || NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_TIMEACCESS) || NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_TIMEMODIFY) || NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACEUSED))) { error = nfsrv_proxyds(vp, 0, 0, nd->nd_cred, p, NFSPROC_GETATTR, NULL, NULL, NULL, &na, NULL, NULL, 0, NULL); if (error == 0) gotattr = 1; } error = VOP_GETATTR(vp, &nvap->na_vattr, nd->nd_cred); if (lockedit != 0) NFSVOPUNLOCK(vp); /* * If we got the Change, Size and Modify Time from the DS, * replace them. */ if (gotattr != 0) { nvap->na_atime = na.na_atime; nvap->na_mtime = na.na_mtime; nvap->na_filerev = na.na_filerev; nvap->na_size = na.na_size; nvap->na_bytes = na.na_bytes; } NFSD_DEBUG(4, "nfsvno_getattr: gotattr=%d err=%d chg=%ju\n", gotattr, error, (uintmax_t)na.na_filerev); NFSEXITCODE(error); return (error); } /* * Get a file handle for a vnode. */ int nfsvno_getfh(struct vnode *vp, fhandle_t *fhp, struct thread *p) { int error; NFSBZERO((caddr_t)fhp, sizeof(fhandle_t)); fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fhp->fh_fid); NFSEXITCODE(error); return (error); } /* * Perform access checking for vnodes obtained from file handles that would * refer to files already opened by a Unix client. You cannot just use * vn_writechk() and VOP_ACCESSX() for two reasons. * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write * case. * 2 - The owner is to be given access irrespective of mode bits for some * operations, so that processes that chmod after opening a file don't * break. */ int nfsvno_accchk(struct vnode *vp, accmode_t accmode, struct ucred *cred, struct nfsexstuff *exp, struct thread *p, int override, int vpislocked, u_int32_t *supportedtypep) { struct vattr vattr; int error = 0, getret = 0; if (vpislocked == 0) { if (NFSVOPLOCK(vp, LK_SHARED) != 0) { error = EPERM; goto out; } } if (accmode & VWRITE) { /* Just vn_writechk() changed to check rdonly */ /* * Disallow write attempts on read-only file systems; * unless the file is a socket or a block or character * device resident on the file system. */ if (NFSVNO_EXRDONLY(exp) || (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: error = EROFS; default: break; } } /* * If there's shared text associated with * the inode, try to free it up once. If * we fail, we can't allow writing. */ if (VOP_IS_TEXT(vp) && error == 0) error = ETXTBSY; } if (error != 0) { if (vpislocked == 0) NFSVOPUNLOCK(vp); goto out; } /* * Should the override still be applied when ACLs are enabled? */ error = VOP_ACCESSX(vp, accmode, cred, p); if (error != 0 && (accmode & (VDELETE | VDELETE_CHILD))) { /* * Try again with VEXPLICIT_DENY, to see if the test for * deletion is supported. */ error = VOP_ACCESSX(vp, accmode | VEXPLICIT_DENY, cred, p); if (error == 0) { if (vp->v_type == VDIR) { accmode &= ~(VDELETE | VDELETE_CHILD); accmode |= VWRITE; error = VOP_ACCESSX(vp, accmode, cred, p); } else if (supportedtypep != NULL) { *supportedtypep &= ~NFSACCESS_DELETE; } } } /* * Allow certain operations for the owner (reads and writes * on files that are already open). */ if (override != NFSACCCHK_NOOVERRIDE && (error == EPERM || error == EACCES)) { if (cred->cr_uid == 0 && (override & NFSACCCHK_ALLOWROOT)) error = 0; else if (override & NFSACCCHK_ALLOWOWNER) { getret = VOP_GETATTR(vp, &vattr, cred); if (getret == 0 && cred->cr_uid == vattr.va_uid) error = 0; } } if (vpislocked == 0) NFSVOPUNLOCK(vp); out: NFSEXITCODE(error); return (error); } /* * Set attribute(s) vnop. */ int nfsvno_setattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { u_quad_t savsize = 0; int error, savedit; time_t savbtime; /* * If this is an exported file system and a pNFS service is running, * don't VOP_SETATTR() of size for the MDS file system. */ savedit = 0; error = 0; if (vp->v_type == VREG && (vp->v_mount->mnt_flag & MNT_EXPORTED) != 0 && nfsrv_devidcnt != 0 && nvap->na_vattr.va_size != VNOVAL && nvap->na_vattr.va_size > 0) { savsize = nvap->na_vattr.va_size; nvap->na_vattr.va_size = VNOVAL; if (nvap->na_vattr.va_uid != (uid_t)VNOVAL || nvap->na_vattr.va_gid != (gid_t)VNOVAL || nvap->na_vattr.va_mode != (mode_t)VNOVAL || nvap->na_vattr.va_atime.tv_sec != VNOVAL || nvap->na_vattr.va_mtime.tv_sec != VNOVAL) savedit = 1; else savedit = 2; } if (savedit != 2) error = VOP_SETATTR(vp, &nvap->na_vattr, cred); if (savedit != 0) nvap->na_vattr.va_size = savsize; if (error == 0 && (nvap->na_vattr.va_uid != (uid_t)VNOVAL || nvap->na_vattr.va_gid != (gid_t)VNOVAL || nvap->na_vattr.va_size != VNOVAL || nvap->na_vattr.va_mode != (mode_t)VNOVAL || nvap->na_vattr.va_atime.tv_sec != VNOVAL || nvap->na_vattr.va_mtime.tv_sec != VNOVAL)) { /* Never modify birthtime on a DS file. */ savbtime = nvap->na_vattr.va_birthtime.tv_sec; nvap->na_vattr.va_birthtime.tv_sec = VNOVAL; /* For a pNFS server, set the attributes on the DS file. */ error = nfsrv_proxyds(vp, 0, 0, cred, p, NFSPROC_SETATTR, NULL, NULL, NULL, nvap, NULL, NULL, 0, NULL); nvap->na_vattr.va_birthtime.tv_sec = savbtime; if (error == ENOENT) error = 0; } NFSEXITCODE(error); return (error); } /* * Set up nameidata for a lookup() call and do it. */ int nfsvno_namei(struct nfsrv_descript *nd, struct nameidata *ndp, struct vnode *dp, int islocked, struct nfsexstuff *exp, struct vnode **retdirp) { struct componentname *cnp = &ndp->ni_cnd; int i; struct iovec aiov; struct uio auio; int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0, linklen; int error = 0; char *cp; *retdirp = NULL; cnp->cn_nameptr = cnp->cn_pnbuf; ndp->ni_lcf = 0; /* * Extract and set starting directory. */ if (dp->v_type != VDIR) { if (islocked) vput(dp); else vrele(dp); nfsvno_relpathbuf(ndp); error = ENOTDIR; goto out1; } if (islocked) NFSVOPUNLOCK(dp); VREF(dp); *retdirp = dp; if (NFSVNO_EXRDONLY(exp)) cnp->cn_flags |= RDONLY; ndp->ni_segflg = UIO_SYSSPACE; if (nd->nd_flag & ND_PUBLOOKUP) { ndp->ni_loopcnt = 0; if (cnp->cn_pnbuf[0] == '/') { vrele(dp); /* * Check for degenerate pathnames here, since lookup() * panics on them. */ for (i = 1; i < ndp->ni_pathlen; i++) if (cnp->cn_pnbuf[i] != '/') break; if (i == ndp->ni_pathlen) { error = NFSERR_ACCES; goto out; } dp = rootvnode; VREF(dp); } } else if ((nfsrv_enable_crossmntpt == 0 && NFSVNO_EXPORTED(exp)) || (nd->nd_flag & ND_NFSV4) == 0) { /* * Only cross mount points for NFSv4 when doing a * mount while traversing the file system above * the mount point, unless nfsrv_enable_crossmntpt is set. */ cnp->cn_flags |= NOCROSSMOUNT; } /* * Initialize for scan, set ni_startdir and bump ref on dp again * because lookup() will dereference ni_startdir. */ ndp->ni_startdir = dp; ndp->ni_rootdir = rootvnode; ndp->ni_topdir = NULL; if (!lockleaf) cnp->cn_flags |= LOCKLEAF; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; /* * Call lookup() to do the real work. If an error occurs, * ndp->ni_vp and ni_dvp are left uninitialized or NULL and * we do not have to dereference anything before returning. * In either case ni_startdir will be dereferenced and NULLed * out. */ error = vfs_lookup(ndp); if (error) break; /* * Check for encountering a symbolic link. Trivial * termination occurs if no symlink encountered. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { - if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) - nfsvno_relpathbuf(ndp); if (ndp->ni_vp && !lockleaf) NFSVOPUNLOCK(ndp->ni_vp); break; } /* * Validate symlink */ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) NFSVOPUNLOCK(ndp->ni_dvp); if (!(nd->nd_flag & ND_PUBLOOKUP)) { error = EINVAL; goto badlink2; } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto badlink2; } if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = NULL; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { badlink1: if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); badlink2: vrele(ndp->ni_dvp); vput(ndp->ni_vp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { error = ENOENT; goto badlink1; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { error = ENAMETOOLONG; goto badlink1; } /* * Adjust or replace path */ if (ndp->ni_pathlen > 1) { NFSBCOPY(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; /* * Cleanup refs for next loop and check if root directory * should replace current directory. Normally ni_dvp * becomes the new base directory and is cleaned up when * we loop. Explicitly null pointers after invalidation * to clarify operation. */ vput(ndp->ni_vp); ndp->ni_vp = NULL; if (cnp->cn_pnbuf[0] == '/') { vrele(ndp->ni_dvp); ndp->ni_dvp = ndp->ni_rootdir; VREF(ndp->ni_dvp); } ndp->ni_startdir = ndp->ni_dvp; ndp->ni_dvp = NULL; } if (!lockleaf) cnp->cn_flags &= ~LOCKLEAF; out: if (error) { nfsvno_relpathbuf(ndp); ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; } else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) { ndp->ni_dvp = NULL; } out1: NFSEXITCODE2(error, nd); return (error); } /* * Set up a pathname buffer and return a pointer to it and, optionally * set a hash pointer. */ void nfsvno_setpathbuf(struct nameidata *ndp, char **bufpp, u_long **hashpp) { struct componentname *cnp = &ndp->ni_cnd; - cnp->cn_flags |= (NOMACCHECK | HASBUF); + cnp->cn_flags |= (NOMACCHECK); cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); if (hashpp != NULL) *hashpp = NULL; *bufpp = cnp->cn_pnbuf; } /* * Release the above path buffer, if not released by nfsvno_namei(). */ void nfsvno_relpathbuf(struct nameidata *ndp) { - if ((ndp->ni_cnd.cn_flags & HASBUF) == 0) - panic("nfsrelpath"); uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); - ndp->ni_cnd.cn_flags &= ~HASBUF; + ndp->ni_cnd.cn_pnbuf = NULL; } /* * Readlink vnode op into an mbuf list. */ int nfsvno_readlink(struct vnode *vp, struct ucred *cred, int maxextsiz, struct thread *p, struct mbuf **mpp, struct mbuf **mpendp, int *lenp) { struct iovec *iv; struct uio io, *uiop = &io; struct mbuf *mp, *mp3; int len, tlen, error = 0; len = NFS_MAXPATHLEN; if (maxextsiz > 0) uiop->uio_iovcnt = nfsrv_createiovec_extpgs(len, maxextsiz, &mp3, &mp, &iv); else uiop->uio_iovcnt = nfsrv_createiovec(len, &mp3, &mp, &iv); uiop->uio_iov = iv; uiop->uio_offset = 0; uiop->uio_resid = len; uiop->uio_rw = UIO_READ; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = NULL; error = VOP_READLINK(vp, uiop, cred); free(iv, M_TEMP); if (error) { m_freem(mp3); *lenp = 0; goto out; } if (uiop->uio_resid > 0) { len -= uiop->uio_resid; tlen = NFSM_RNDUP(len); if (tlen == 0) { m_freem(mp3); mp3 = mp = NULL; } else if (tlen != NFS_MAXPATHLEN || tlen != len) mp = nfsrv_adj(mp3, NFS_MAXPATHLEN - tlen, tlen - len); } *lenp = len; *mpp = mp3; *mpendp = mp; out: NFSEXITCODE(error); return (error); } /* * Create an mbuf chain and an associated iovec that can be used to Read * or Getextattr of data. * Upon success, return pointers to the first and last mbufs in the chain * plus the malloc'd iovec and its iovlen. */ static int nfsrv_createiovec(int len, struct mbuf **mpp, struct mbuf **mpendp, struct iovec **ivp) { struct mbuf *m, *m2 = NULL, *m3; struct iovec *iv; int i, left, siz; left = len; m3 = NULL; /* * Generate the mbuf list with the uio_iov ref. to it. */ i = 0; while (left > 0) { NFSMGET(m); MCLGET(m, M_WAITOK); m->m_len = 0; siz = min(M_TRAILINGSPACE(m), left); left -= siz; i++; if (m3) m2->m_next = m; else m3 = m; m2 = m; } *ivp = iv = malloc(i * sizeof (struct iovec), M_TEMP, M_WAITOK); m = m3; left = len; i = 0; while (left > 0) { if (m == NULL) panic("nfsrv_createiovec iov"); siz = min(M_TRAILINGSPACE(m), left); if (siz > 0) { iv->iov_base = mtod(m, caddr_t) + m->m_len; iv->iov_len = siz; m->m_len += siz; left -= siz; iv++; i++; } m = m->m_next; } *mpp = m3; *mpendp = m2; return (i); } /* * Create an mbuf chain and an associated iovec that can be used to Read * or Getextattr of data. * Upon success, return pointers to the first and last mbufs in the chain * plus the malloc'd iovec and its iovlen. * Same as above, but creates ext_pgs mbuf(s). */ static int nfsrv_createiovec_extpgs(int len, int maxextsiz, struct mbuf **mpp, struct mbuf **mpendp, struct iovec **ivp) { struct mbuf *m, *m2 = NULL, *m3; struct iovec *iv; int i, left, pgno, siz; left = len; m3 = NULL; /* * Generate the mbuf list with the uio_iov ref. to it. */ i = 0; while (left > 0) { siz = min(left, maxextsiz); m = mb_alloc_ext_plus_pages(siz, M_WAITOK); left -= siz; i += m->m_epg_npgs; if (m3 != NULL) m2->m_next = m; else m3 = m; m2 = m; } *ivp = iv = malloc(i * sizeof (struct iovec), M_TEMP, M_WAITOK); m = m3; left = len; i = 0; pgno = 0; while (left > 0) { if (m == NULL) panic("nfsvno_createiovec_extpgs iov"); siz = min(PAGE_SIZE, left); if (siz > 0) { iv->iov_base = (void *)PHYS_TO_DMAP(m->m_epg_pa[pgno]); iv->iov_len = siz; m->m_len += siz; if (pgno == m->m_epg_npgs - 1) m->m_epg_last_len = siz; left -= siz; iv++; i++; pgno++; } if (pgno == m->m_epg_npgs && left > 0) { m = m->m_next; if (m == NULL) panic("nfsvno_createiovec_extpgs iov"); pgno = 0; } } *mpp = m3; *mpendp = m2; return (i); } /* * Read vnode op call into mbuf list. */ int nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred, int maxextsiz, struct thread *p, struct mbuf **mpp, struct mbuf **mpendp) { struct mbuf *m; struct iovec *iv; int error = 0, len, tlen, ioflag = 0; struct mbuf *m3; struct uio io, *uiop = &io; struct nfsheur *nh; /* * Attempt to read from a DS file. A return of ENOENT implies * there is no DS file to read. */ error = nfsrv_proxyds(vp, off, cnt, cred, p, NFSPROC_READDS, mpp, NULL, mpendp, NULL, NULL, NULL, 0, NULL); if (error != ENOENT) return (error); len = NFSM_RNDUP(cnt); if (maxextsiz > 0) uiop->uio_iovcnt = nfsrv_createiovec_extpgs(len, maxextsiz, &m3, &m, &iv); else uiop->uio_iovcnt = nfsrv_createiovec(len, &m3, &m, &iv); uiop->uio_iov = iv; uiop->uio_offset = off; uiop->uio_resid = len; uiop->uio_rw = UIO_READ; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = NULL; nh = nfsrv_sequential_heuristic(uiop, vp); ioflag |= nh->nh_seqcount << IO_SEQSHIFT; /* XXX KDM make this more systematic? */ nfsstatsv1.srvbytes[NFSV4OP_READ] += uiop->uio_resid; error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred); free(iv, M_TEMP); if (error) { m_freem(m3); *mpp = NULL; goto out; } nh->nh_nextoff = uiop->uio_offset; tlen = len - uiop->uio_resid; cnt = cnt < tlen ? cnt : tlen; tlen = NFSM_RNDUP(cnt); if (tlen == 0) { m_freem(m3); m3 = m = NULL; } else if (len != tlen || tlen != cnt) m = nfsrv_adj(m3, len - tlen, tlen - cnt); *mpp = m3; *mpendp = m; out: NFSEXITCODE(error); return (error); } /* * Create the iovec for the mbuf chain passed in as an argument. * The "cp" argument is where the data starts within the first mbuf in * the chain. It returns the iovec and the iovcnt. */ static int nfsrv_createiovecw(int retlen, struct mbuf *m, char *cp, struct iovec **ivpp, int *iovcntp) { struct mbuf *mp; struct iovec *ivp; int cnt, i, len; /* * Loop through the mbuf chain, counting how many mbufs are a * part of this write operation, so the iovec size is known. */ cnt = 0; len = retlen; mp = m; i = mtod(mp, caddr_t) + mp->m_len - cp; while (len > 0) { if (i > 0) { len -= i; cnt++; } mp = mp->m_next; if (!mp) { if (len > 0) return (EBADRPC); } else i = mp->m_len; } /* Now, create the iovec. */ mp = m; *ivpp = ivp = malloc(cnt * sizeof (struct iovec), M_TEMP, M_WAITOK); *iovcntp = cnt; i = mtod(mp, caddr_t) + mp->m_len - cp; len = retlen; while (len > 0) { if (mp == NULL) panic("nfsrv_createiovecw"); if (i > 0) { i = min(i, len); ivp->iov_base = cp; ivp->iov_len = i; ivp++; len -= i; } mp = mp->m_next; if (mp) { i = mp->m_len; cp = mtod(mp, caddr_t); } } return (0); } /* * Write vnode op from an mbuf list. */ int nfsvno_write(struct vnode *vp, off_t off, int retlen, int *stable, struct mbuf *mp, char *cp, struct ucred *cred, struct thread *p) { struct iovec *iv; int cnt, ioflags, error; struct uio io, *uiop = &io; struct nfsheur *nh; /* * Attempt to write to a DS file. A return of ENOENT implies * there is no DS file to write. */ error = nfsrv_proxyds(vp, off, retlen, cred, p, NFSPROC_WRITEDS, &mp, cp, NULL, NULL, NULL, NULL, 0, NULL); if (error != ENOENT) { *stable = NFSWRITE_FILESYNC; return (error); } if (*stable == NFSWRITE_UNSTABLE) ioflags = IO_NODELOCKED; else ioflags = (IO_SYNC | IO_NODELOCKED); error = nfsrv_createiovecw(retlen, mp, cp, &iv, &cnt); if (error != 0) return (error); uiop->uio_iov = iv; uiop->uio_iovcnt = cnt; uiop->uio_resid = retlen; uiop->uio_rw = UIO_WRITE; uiop->uio_segflg = UIO_SYSSPACE; NFSUIOPROC(uiop, p); uiop->uio_offset = off; nh = nfsrv_sequential_heuristic(uiop, vp); ioflags |= nh->nh_seqcount << IO_SEQSHIFT; /* XXX KDM make this more systematic? */ nfsstatsv1.srvbytes[NFSV4OP_WRITE] += uiop->uio_resid; error = VOP_WRITE(vp, uiop, ioflags, cred); if (error == 0) nh->nh_nextoff = uiop->uio_offset; free(iv, M_TEMP); NFSEXITCODE(error); return (error); } /* * Common code for creating a regular file (plus special files for V2). */ int nfsvno_createsub(struct nfsrv_descript *nd, struct nameidata *ndp, struct vnode **vpp, struct nfsvattr *nvap, int *exclusive_flagp, int32_t *cverf, NFSDEV_T rdev, struct nfsexstuff *exp) { u_quad_t tempsize; int error; struct thread *p = curthread; error = nd->nd_repstat; if (!error && ndp->ni_vp == NULL) { if (nvap->na_type == VREG || nvap->na_type == VSOCK) { vrele(ndp->ni_startdir); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); /* For a pNFS server, create the data file on a DS. */ if (error == 0 && nvap->na_type == VREG) { /* * Create a data file on a DS for a pNFS server. * This function just returns if not * running a pNFS DS or the creation fails. */ nfsrv_pnfscreate(ndp->ni_vp, &nvap->na_vattr, nd->nd_cred, p); } VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &ndp->ni_vp : NULL, false); nfsvno_relpathbuf(ndp); if (!error) { if (*exclusive_flagp) { *exclusive_flagp = 0; NFSVNO_ATTRINIT(nvap); nvap->na_atime.tv_sec = cverf[0]; nvap->na_atime.tv_nsec = cverf[1]; error = VOP_SETATTR(ndp->ni_vp, &nvap->na_vattr, nd->nd_cred); if (error != 0) { vput(ndp->ni_vp); ndp->ni_vp = NULL; error = NFSERR_NOTSUPP; } } } /* * NFS V2 Only. nfsrvd_mknod() does this for V3. * (This implies, just get out on an error.) */ } else if (nvap->na_type == VCHR || nvap->na_type == VBLK || nvap->na_type == VFIFO) { if (nvap->na_type == VCHR && rdev == 0xffffffff) nvap->na_type = VFIFO; if (nvap->na_type != VFIFO && (error = priv_check_cred(nd->nd_cred, PRIV_VFS_MKNOD_DEV))) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); goto out; } nvap->na_rdev = rdev; error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &ndp->ni_vp : NULL, false); nfsvno_relpathbuf(ndp); vrele(ndp->ni_startdir); if (error) goto out; } else { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); error = ENXIO; goto out; } *vpp = ndp->ni_vp; } else { /* * Handle cases where error is already set and/or * the file exists. * 1 - clean up the lookup * 2 - iff !error and na_size set, truncate it */ vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); *vpp = ndp->ni_vp; if (ndp->ni_dvp == *vpp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); if (!error && nvap->na_size != VNOVAL) { error = nfsvno_accchk(*vpp, VWRITE, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); if (!error) { tempsize = nvap->na_size; NFSVNO_ATTRINIT(nvap); nvap->na_size = tempsize; error = nfsvno_setattr(*vpp, nvap, nd->nd_cred, p, exp); } } if (error) vput(*vpp); } out: NFSEXITCODE(error); return (error); } /* * Do a mknod vnode op. */ int nfsvno_mknod(struct nameidata *ndp, struct nfsvattr *nvap, struct ucred *cred, struct thread *p) { int error = 0; enum vtype vtyp; vtyp = nvap->na_type; /* * Iff doesn't exist, create it. */ if (ndp->ni_vp) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); vrele(ndp->ni_vp); error = EEXIST; goto out; } if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); error = NFSERR_BADTYPE; goto out; } if (vtyp == VSOCK) { vrele(ndp->ni_startdir); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &ndp->ni_vp : NULL, false); nfsvno_relpathbuf(ndp); } else { if (nvap->na_type != VFIFO && (error = priv_check_cred(cred, PRIV_VFS_MKNOD_DEV))) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); goto out; } error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &ndp->ni_vp : NULL, false); nfsvno_relpathbuf(ndp); vrele(ndp->ni_startdir); /* * Since VOP_MKNOD returns the ni_vp, I can't * see any reason to do the lookup. */ } out: NFSEXITCODE(error); return (error); } /* * Mkdir vnode op. */ int nfsvno_mkdir(struct nameidata *ndp, struct nfsvattr *nvap, uid_t saved_uid, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { int error = 0; if (ndp->ni_vp != NULL) { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vrele(ndp->ni_vp); nfsvno_relpathbuf(ndp); error = EEXIST; goto out; } error = VOP_MKDIR(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &ndp->ni_vp : NULL, false); nfsvno_relpathbuf(ndp); out: NFSEXITCODE(error); return (error); } /* * symlink vnode op. */ int nfsvno_symlink(struct nameidata *ndp, struct nfsvattr *nvap, char *pathcp, int pathlen, int not_v2, uid_t saved_uid, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { int error = 0; if (ndp->ni_vp) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vrele(ndp->ni_vp); error = EEXIST; goto out; } error = VOP_SYMLINK(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr, pathcp); /* * Although FreeBSD still had the lookup code in * it for 7/current, there doesn't seem to be any * point, since VOP_SYMLINK() returns the ni_vp. * Just vput it for v2. */ VOP_VPUT_PAIR(ndp->ni_dvp, &ndp->ni_vp, !not_v2 && error == 0); vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); out: NFSEXITCODE(error); return (error); } /* * Parse symbolic link arguments. * This function has an ugly side effect. It will malloc() an area for * the symlink and set iov_base to point to it, only if it succeeds. * So, if it returns with uiop->uio_iov->iov_base != NULL, that must * be FREE'd later. */ int nfsvno_getsymlink(struct nfsrv_descript *nd, struct nfsvattr *nvap, struct thread *p, char **pathcpp, int *lenp) { u_int32_t *tl; char *pathcp = NULL; int error = 0, len; struct nfsv2_sattr *sp; *pathcpp = NULL; *lenp = 0; if ((nd->nd_flag & ND_NFSV3) && (error = nfsrv_sattr(nd, NULL, nvap, NULL, NULL, p))) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len > NFS_MAXPATHLEN || len <= 0) { error = EBADRPC; goto nfsmout; } pathcp = malloc(len + 1, M_TEMP, M_WAITOK); error = nfsrv_mtostr(nd, pathcp, len); if (error) goto nfsmout; if (nd->nd_flag & ND_NFSV2) { NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR); nvap->na_mode = fxdr_unsigned(u_int16_t, sp->sa_mode); } *pathcpp = pathcp; *lenp = len; NFSEXITCODE2(0, nd); return (0); nfsmout: if (pathcp) free(pathcp, M_TEMP); NFSEXITCODE2(error, nd); return (error); } /* * Remove a non-directory object. */ int nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { struct vnode *vp, *dsdvp[NFSDEV_MAXMIRRORS]; int error = 0, mirrorcnt; char fname[PNFS_FILENAME_LEN + 1]; fhandle_t fh; vp = ndp->ni_vp; dsdvp[0] = NULL; if (vp->v_type == VDIR) error = NFSERR_ISDIR; else if (is_v4) error = nfsrv_checkremove(vp, 1, NULL, (nfsquad_t)((u_quad_t)0), p); if (error == 0) nfsrv_pnfsremovesetup(vp, p, dsdvp, &mirrorcnt, fname, &fh); if (!error) error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd); if (error == 0 && dsdvp[0] != NULL) nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p); if (ndp->ni_dvp == vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vput(vp); - if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0) - nfsvno_relpathbuf(ndp); + nfsvno_relpathbuf(ndp); NFSEXITCODE(error); return (error); } /* * Remove a directory. */ int nfsvno_rmdirsub(struct nameidata *ndp, int is_v4, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { struct vnode *vp; int error = 0; vp = ndp->ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (ndp->ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) error = EBUSY; out: if (!error) error = VOP_RMDIR(ndp->ni_dvp, vp, &ndp->ni_cnd); if (ndp->ni_dvp == vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vput(vp); - if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0) - nfsvno_relpathbuf(ndp); + nfsvno_relpathbuf(ndp); NFSEXITCODE(error); return (error); } /* * Rename vnode op. */ int nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p) { struct vnode *fvp, *tvp, *tdvp, *dsdvp[NFSDEV_MAXMIRRORS]; int error = 0, mirrorcnt; char fname[PNFS_FILENAME_LEN + 1]; fhandle_t fh; dsdvp[0] = NULL; fvp = fromndp->ni_vp; if (ndstat) { vrele(fromndp->ni_dvp); vrele(fvp); error = ndstat; goto out1; } tdvp = tondp->ni_dvp; tvp = tondp->ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST; goto out; } if (tvp->v_type == VDIR && tvp->v_mountedhere) { error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV; goto out; } /* * A rename to '.' or '..' results in a prematurely * unlocked vnode on FreeBSD5, so I'm just going to fail that * here. */ if ((tondp->ni_cnd.cn_namelen == 1 && tondp->ni_cnd.cn_nameptr[0] == '.') || (tondp->ni_cnd.cn_namelen == 2 && tondp->ni_cnd.cn_nameptr[0] == '.' && tondp->ni_cnd.cn_nameptr[1] == '.')) { error = EINVAL; goto out; } } if (fvp->v_type == VDIR && fvp->v_mountedhere) { error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV; goto out; } if (fvp->v_mount != tdvp->v_mount) { error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV; goto out; } if (fvp == tdvp) { error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL; goto out; } if (fvp == tvp) { /* * If source and destination are the same, there is nothing to * do. Set error to -1 to indicate this. */ error = -1; goto out; } if (ndflag & ND_NFSV4) { if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) { error = nfsrv_checkremove(fvp, 0, NULL, (nfsquad_t)((u_quad_t)0), p); NFSVOPUNLOCK(fvp); } else error = EPERM; if (tvp && !error) error = nfsrv_checkremove(tvp, 1, NULL, (nfsquad_t)((u_quad_t)0), p); } else { /* * For NFSv2 and NFSv3, try to get rid of the delegation, so * that the NFSv4 client won't be confused by the rename. * Since nfsd_recalldelegation() can only be called on an * unlocked vnode at this point and fvp is the file that will * still exist after the rename, just do fvp. */ nfsd_recalldelegation(fvp, p); } if (error == 0 && tvp != NULL) { nfsrv_pnfsremovesetup(tvp, p, dsdvp, &mirrorcnt, fname, &fh); NFSD_DEBUG(4, "nfsvno_rename: pnfsremovesetup" " dsdvp=%p\n", dsdvp[0]); } out: if (!error) { error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp, &fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp, &tondp->ni_cnd); } else { if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fromndp->ni_dvp); vrele(fvp); if (error == -1) error = 0; } /* * If dsdvp[0] != NULL, it was set up by nfsrv_pnfsremovesetup() and * if the rename succeeded, the DS file for the tvp needs to be * removed. */ if (error == 0 && dsdvp[0] != NULL) { nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p); NFSD_DEBUG(4, "nfsvno_rename: pnfsremove\n"); } vrele(tondp->ni_startdir); nfsvno_relpathbuf(tondp); out1: vrele(fromndp->ni_startdir); nfsvno_relpathbuf(fromndp); NFSEXITCODE(error); return (error); } /* * Link vnode op. */ int nfsvno_link(struct nameidata *ndp, struct vnode *vp, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { struct vnode *xp; int error = 0; xp = ndp->ni_vp; if (xp != NULL) { error = EEXIST; } else { xp = ndp->ni_dvp; if (vp->v_mount != xp->v_mount) error = EXDEV; } if (!error) { NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY); if (!VN_IS_DOOMED(vp)) error = VOP_LINK(ndp->ni_dvp, vp, &ndp->ni_cnd); else error = EPERM; if (ndp->ni_dvp == vp) { vrele(ndp->ni_dvp); NFSVOPUNLOCK(vp); } else { vref(vp); VOP_VPUT_PAIR(ndp->ni_dvp, &vp, true); } } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); if (ndp->ni_vp) vrele(ndp->ni_vp); } nfsvno_relpathbuf(ndp); NFSEXITCODE(error); return (error); } /* * Do the fsync() appropriate for the commit. */ int nfsvno_fsync(struct vnode *vp, u_int64_t off, int cnt, struct ucred *cred, struct thread *td) { int error = 0; /* * RFC 1813 3.3.21: if count is 0, a flush from offset to the end of * file is done. At this time VOP_FSYNC does not accept offset and * byte count parameters so call VOP_FSYNC the whole file for now. * The same is true for NFSv4: RFC 3530 Sec. 14.2.3. * File systems that do not use the buffer cache (as indicated * by MNTK_USES_BCACHE not being set) must use VOP_FSYNC(). */ if (cnt == 0 || cnt > MAX_COMMIT_COUNT || (vp->v_mount->mnt_kern_flag & MNTK_USES_BCACHE) == 0) { /* * Give up and do the whole thing */ if (vp->v_object && vm_object_mightbedirty(vp->v_object)) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); } else { /* * Locate and synchronously write any buffers that fall * into the requested range. Note: we are assuming that * f_iosize is a power of 2. */ int iosize = vp->v_mount->mnt_stat.f_iosize; int iomask = iosize - 1; struct bufobj *bo; daddr_t lblkno; /* * Align to iosize boundary, super-align to page boundary. */ if (off & iomask) { cnt += off & iomask; off &= ~(u_quad_t)iomask; } if (off & PAGE_MASK) { cnt += off & PAGE_MASK; off &= ~(u_quad_t)PAGE_MASK; } lblkno = off / iosize; if (vp->v_object && vm_object_mightbedirty(vp->v_object)) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, off, off + cnt, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_object); } bo = &vp->v_bufobj; BO_LOCK(bo); while (cnt > 0) { struct buf *bp; /* * If we have a buffer and it is marked B_DELWRI we * have to lock and write it. Otherwise the prior * write is assumed to have already been committed. * * gbincore() can return invalid buffers now so we * have to check that bit as well (though B_DELWRI * should not be set if B_INVAL is set there could be * a race here since we haven't locked the buffer). */ if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); continue; /* retry */ } if ((bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI) { bremfree(bp); bp->b_flags &= ~B_ASYNC; bwrite(bp); ++nfs_commit_miss; } else BUF_UNLOCK(bp); BO_LOCK(bo); } ++nfs_commit_blks; if (cnt < iosize) break; cnt -= iosize; ++lblkno; } BO_UNLOCK(bo); } NFSEXITCODE(error); return (error); } /* * Statfs vnode op. */ int nfsvno_statfs(struct vnode *vp, struct statfs *sf) { struct statfs *tsf; int error; tsf = NULL; if (nfsrv_devidcnt > 0) { /* For a pNFS service, get the DS numbers. */ tsf = malloc(sizeof(*tsf), M_TEMP, M_WAITOK | M_ZERO); error = nfsrv_pnfsstatfs(tsf, vp->v_mount); if (error != 0) { free(tsf, M_TEMP); tsf = NULL; } } error = VFS_STATFS(vp->v_mount, sf); if (error == 0) { if (tsf != NULL) { sf->f_blocks = tsf->f_blocks; sf->f_bavail = tsf->f_bavail; sf->f_bfree = tsf->f_bfree; sf->f_bsize = tsf->f_bsize; } /* * Since NFS handles these values as unsigned on the * wire, there is no way to represent negative values, * so set them to 0. Without this, they will appear * to be very large positive values for clients like * Solaris10. */ if (sf->f_bavail < 0) sf->f_bavail = 0; if (sf->f_ffree < 0) sf->f_ffree = 0; } free(tsf, M_TEMP); NFSEXITCODE(error); return (error); } /* * Do the vnode op stuff for Open. Similar to nfsvno_createsub(), but * must handle nfsrv_opencheck() calls after any other access checks. */ void nfsvno_open(struct nfsrv_descript *nd, struct nameidata *ndp, nfsquad_t clientid, nfsv4stateid_t *stateidp, struct nfsstate *stp, int *exclusive_flagp, struct nfsvattr *nvap, int32_t *cverf, int create, NFSACL_T *aclp, nfsattrbit_t *attrbitp, struct ucred *cred, struct nfsexstuff *exp, struct vnode **vpp) { struct vnode *vp = NULL; u_quad_t tempsize; struct nfsexstuff nes; struct thread *p = curthread; if (ndp->ni_vp == NULL) nd->nd_repstat = nfsrv_opencheck(clientid, stateidp, stp, NULL, nd, p, nd->nd_repstat); if (!nd->nd_repstat) { if (ndp->ni_vp == NULL) { vrele(ndp->ni_startdir); nd->nd_repstat = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); /* For a pNFS server, create the data file on a DS. */ if (nd->nd_repstat == 0) { /* * Create a data file on a DS for a pNFS server. * This function just returns if not * running a pNFS DS or the creation fails. */ nfsrv_pnfscreate(ndp->ni_vp, &nvap->na_vattr, cred, p); } VOP_VPUT_PAIR(ndp->ni_dvp, nd->nd_repstat == 0 ? &ndp->ni_vp : NULL, false); nfsvno_relpathbuf(ndp); if (!nd->nd_repstat) { if (*exclusive_flagp) { *exclusive_flagp = 0; NFSVNO_ATTRINIT(nvap); nvap->na_atime.tv_sec = cverf[0]; nvap->na_atime.tv_nsec = cverf[1]; nd->nd_repstat = VOP_SETATTR(ndp->ni_vp, &nvap->na_vattr, cred); if (nd->nd_repstat != 0) { vput(ndp->ni_vp); ndp->ni_vp = NULL; nd->nd_repstat = NFSERR_NOTSUPP; } else NFSSETBIT_ATTRBIT(attrbitp, NFSATTRBIT_TIMEACCESS); } else { nfsrv_fixattr(nd, ndp->ni_vp, nvap, aclp, p, attrbitp, exp); } } vp = ndp->ni_vp; } else { if (ndp->ni_startdir) vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vp = ndp->ni_vp; if (create == NFSV4OPEN_CREATE) { if (ndp->ni_dvp == vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); } if (NFSVNO_ISSETSIZE(nvap) && vp->v_type == VREG) { if (ndp->ni_cnd.cn_flags & RDONLY) NFSVNO_SETEXRDONLY(&nes); else NFSVNO_EXINIT(&nes); nd->nd_repstat = nfsvno_accchk(vp, VWRITE, cred, &nes, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); nd->nd_repstat = nfsrv_opencheck(clientid, stateidp, stp, vp, nd, p, nd->nd_repstat); if (!nd->nd_repstat) { tempsize = nvap->na_size; NFSVNO_ATTRINIT(nvap); nvap->na_size = tempsize; nd->nd_repstat = nfsvno_setattr(vp, nvap, cred, p, exp); } } else if (vp->v_type == VREG) { nd->nd_repstat = nfsrv_opencheck(clientid, stateidp, stp, vp, nd, p, nd->nd_repstat); } } } else { - if (ndp->ni_cnd.cn_flags & HASBUF) - nfsvno_relpathbuf(ndp); + nfsvno_relpathbuf(ndp); if (ndp->ni_startdir && create == NFSV4OPEN_CREATE) { vrele(ndp->ni_startdir); if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); if (ndp->ni_vp) vput(ndp->ni_vp); } } *vpp = vp; NFSEXITCODE2(0, nd); } /* * Updates the file rev and sets the mtime and ctime * to the current clock time, returning the va_filerev and va_Xtime * values. * Return ESTALE to indicate the vnode is VIRF_DOOMED. */ int nfsvno_updfilerev(struct vnode *vp, struct nfsvattr *nvap, struct nfsrv_descript *nd, struct thread *p) { struct vattr va; VATTR_NULL(&va); vfs_timestamp(&va.va_mtime); if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(vp)) return (ESTALE); } (void) VOP_SETATTR(vp, &va, nd->nd_cred); (void) nfsvno_getattr(vp, nvap, nd, p, 1, NULL); return (0); } /* * Glue routine to nfsv4_fillattr(). */ int nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp, struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p, int isdgram, int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno) { struct statfs *sf; int error; sf = NULL; if (nfsrv_devidcnt > 0 && (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACEAVAIL) || NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACEFREE) || NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACETOTAL))) { sf = malloc(sizeof(*sf), M_TEMP, M_WAITOK | M_ZERO); error = nfsrv_pnfsstatfs(sf, mp); if (error != 0) { free(sf, M_TEMP); sf = NULL; } } error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror, attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root, mounted_on_fileno, sf); free(sf, M_TEMP); NFSEXITCODE2(0, nd); return (error); } /* Since the Readdir vnode ops vary, put the entire functions in here. */ /* * nfs readdir service * - mallocs what it thinks is enough to read * count rounded up to a multiple of DIRBLKSIZ <= NFS_MAXREADDIR * - calls VOP_READDIR() * - loops around building the reply * if the output generated exceeds count break out of loop * The NFSM_CLGET macro is used here so that the reply will be packed * tightly in mbuf clusters. * - it trims out records with d_fileno == 0 * this doesn't matter for Unix clients, but they might confuse clients * for other os'. * - it trims out records with d_type == DT_WHT * these cannot be seen through NFS (unless we extend the protocol) * The alternate call nfsrvd_readdirplus() does lookups as well. * PS: The NFS protocol spec. does not clarify what the "count" byte * argument is a count of.. just name strings and file id's or the * entire reply rpc or ... * I tried just file name and id sizes and it confused the Sun client, * so I am using the full rpc size now. The "paranoia.." comment refers * to including the status longwords that are not a part of the dir. * "entry" structures, but are in the rpc. */ int nfsrvd_readdir(struct nfsrv_descript *nd, int isdgram, struct vnode *vp, struct nfsexstuff *exp) { struct dirent *dp; u_int32_t *tl; int dirlen; char *cpos, *cend, *rbuf; struct nfsvattr at; int nlen, error = 0, getret = 1; int siz, cnt, fullsiz, eofflag, ncookies; u_int64_t off, toff, verf __unused; uint64_t *cookies = NULL, *cookiep; struct uio io; struct iovec iv; int is_ufs; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); goto out; } if (nd->nd_flag & ND_NFSV2) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); off = fxdr_unsigned(u_quad_t, *tl++); } else { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); off = fxdr_hyper(tl); tl += 2; verf = fxdr_hyper(tl); tl += 2; } toff = off; cnt = fxdr_unsigned(int, *tl); if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0) cnt = NFS_SRVMAXDATA(nd); siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1)); fullsiz = siz; if (nd->nd_flag & ND_NFSV3) { nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL); #if 0 /* * va_filerev is not sufficient as a cookie verifier, * since it is not supposed to change when entries are * removed/added unless that offset cookies returned to * the client are no longer valid. */ if (!nd->nd_repstat && toff && verf != at.na_filerev) nd->nd_repstat = NFSERR_BAD_COOKIE; #endif } if (!nd->nd_repstat && vp->v_type != VDIR) nd->nd_repstat = NFSERR_NOTDIR; if (nd->nd_repstat == 0 && cnt == 0) { if (nd->nd_flag & ND_NFSV2) /* NFSv2 does not have NFSERR_TOOSMALL */ nd->nd_repstat = EPERM; else nd->nd_repstat = NFSERR_TOOSMALL; } if (!nd->nd_repstat) nd->nd_repstat = nfsvno_accchk(vp, VEXEC, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat) { vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0; rbuf = malloc(siz, M_TEMP, M_WAITOK); again: eofflag = 0; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } iv.iov_base = rbuf; iv.iov_len = siz; io.uio_iov = &iv; io.uio_iovcnt = 1; io.uio_offset = (off_t)off; io.uio_resid = siz; io.uio_segflg = UIO_SYSSPACE; io.uio_rw = UIO_READ; io.uio_td = NULL; nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies, &cookies); off = (u_int64_t)io.uio_offset; if (io.uio_resid) siz -= io.uio_resid; if (!cookies && !nd->nd_repstat) nd->nd_repstat = NFSERR_PERM; if (nd->nd_flag & ND_NFSV3) { getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL); if (!nd->nd_repstat) nd->nd_repstat = getret; } /* * Handles the failed cases. nd->nd_repstat == 0 past here. */ if (nd->nd_repstat) { vput(vp); free(rbuf, M_TEMP); if (cookies) free(cookies, M_TEMP); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } /* * If nothing read, return eof * rpc reply */ if (siz == 0) { vput(vp); if (nd->nd_flag & ND_NFSV2) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); } else { nfsrv_postopattr(nd, getret, &at); NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED); txdr_hyper(at.na_filerev, tl); tl += 2; } *tl++ = newnfs_false; *tl = newnfs_true; free(rbuf, M_TEMP); free(cookies, M_TEMP); goto out; } /* * Check for degenerate cases of nothing useful read. * If so go try again */ cpos = rbuf; cend = rbuf + siz; dp = (struct dirent *)cpos; cookiep = cookies; /* * For some reason FreeBSD's ufs_readdir() chooses to back the * directory offset up to a block boundary, so it is necessary to * skip over the records that precede the requested offset. This * requires the assumption that file offset cookies monotonically * increase. */ while (cpos < cend && ncookies > 0 && (dp->d_fileno == 0 || dp->d_type == DT_WHT || (is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff))) { cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } if (cpos >= cend || ncookies == 0) { siz = fullsiz; toff = off; goto again; } vput(vp); /* * If cnt > MCLBYTES and the reply will not be saved, use * ext_pgs mbufs for TLS. * For NFSv4.0, we do not know for sure if the reply will * be saved, so do not use ext_pgs mbufs for NFSv4.0. */ if (cnt > MCLBYTES && siz > MCLBYTES && (nd->nd_flag & (ND_TLS | ND_EXTPG | ND_SAVEREPLY)) == ND_TLS && (nd->nd_flag & (ND_NFSV4 | ND_NFSV41)) != ND_NFSV4) nd->nd_flag |= ND_EXTPG; /* * dirlen is the size of the reply, including all XDR and must * not exceed cnt. For NFSv2, RFC1094 didn't clearly indicate * if the XDR should be included in "count", but to be safe, we do. * (Include the two booleans at the end of the reply in dirlen now.) */ if (nd->nd_flag & ND_NFSV3) { nfsrv_postopattr(nd, getret, &at); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); txdr_hyper(at.na_filerev, tl); dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED; } else { dirlen = 2 * NFSX_UNSIGNED; } /* Loop through the records and build reply */ while (cpos < cend && ncookies > 0) { nlen = dp->d_namlen; if (dp->d_fileno != 0 && dp->d_type != DT_WHT && nlen <= NFS_MAXNAMLEN) { if (nd->nd_flag & ND_NFSV3) dirlen += (6*NFSX_UNSIGNED + NFSM_RNDUP(nlen)); else dirlen += (4*NFSX_UNSIGNED + NFSM_RNDUP(nlen)); if (dirlen > cnt) { eofflag = 0; break; } /* * Build the directory record xdr from * the dirent entry. */ if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = newnfs_true; *tl++ = 0; } else { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = newnfs_true; } *tl = txdr_unsigned(dp->d_fileno); (void) nfsm_strtom(nd, dp->d_name, nlen); if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); txdr_hyper(*cookiep, tl); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(*cookiep); } } cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } if (cpos < cend) eofflag = 0; NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = newnfs_false; if (eofflag) *tl = newnfs_true; else *tl = newnfs_false; free(rbuf, M_TEMP); free(cookies, M_TEMP); out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * Readdirplus for V3 and Readdir for V4. */ int nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram, struct vnode *vp, struct nfsexstuff *exp) { struct dirent *dp; u_int32_t *tl; int dirlen; char *cpos, *cend, *rbuf; struct vnode *nvp; fhandle_t nfh; struct nfsvattr nva, at, *nvap = &nva; struct mbuf *mb0, *mb1; struct nfsreferral *refp; int nlen, r, error = 0, getret = 1, usevget = 1; int siz, cnt, fullsiz, eofflag, ncookies, entrycnt; caddr_t bpos0, bpos1; u_int64_t off, toff, verf __unused; uint64_t *cookies = NULL, *cookiep; nfsattrbit_t attrbits, rderrbits, savbits; struct uio io; struct iovec iv; struct componentname cn; int at_root, is_ufs, is_zfs, needs_unbusy, supports_nfsv4acls; struct mount *mp, *new_mp; uint64_t mounted_on_fileno; struct thread *p = curthread; int bextpg0, bextpg1, bextpgsiz0, bextpgsiz1; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); goto out; } NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED); off = fxdr_hyper(tl); toff = off; tl += 2; verf = fxdr_hyper(tl); tl += 2; siz = fxdr_unsigned(int, *tl++); cnt = fxdr_unsigned(int, *tl); /* * Use the server's maximum data transfer size as the upper bound * on reply datalen. */ if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0) cnt = NFS_SRVMAXDATA(nd); /* * siz is a "hint" of how much directory information (name, fileid, * cookie) should be in the reply. At least one client "hints" 0, * so I set it to cnt for that case. I also round it up to the * next multiple of DIRBLKSIZ. * Since the size of a Readdirplus directory entry reply will always * be greater than a directory entry returned by VOP_READDIR(), it * does not make sense to read more than NFS_SRVMAXDATA() via * VOP_READDIR(). */ if (siz <= 0) siz = cnt; else if (siz > NFS_SRVMAXDATA(nd)) siz = NFS_SRVMAXDATA(nd); siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1)); if (nd->nd_flag & ND_NFSV4) { error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error) goto nfsmout; NFSSET_ATTRBIT(&savbits, &attrbits); NFSCLRNOTFILLABLE_ATTRBIT(&attrbits, nd); NFSZERO_ATTRBIT(&rderrbits); NFSSETBIT_ATTRBIT(&rderrbits, NFSATTRBIT_RDATTRERROR); } else { NFSZERO_ATTRBIT(&attrbits); } fullsiz = siz; nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL); #if 0 if (!nd->nd_repstat) { if (off && verf != at.na_filerev) { /* * va_filerev is not sufficient as a cookie verifier, * since it is not supposed to change when entries are * removed/added unless that offset cookies returned to * the client are no longer valid. */ if (nd->nd_flag & ND_NFSV4) { nd->nd_repstat = NFSERR_NOTSAME; } else { nd->nd_repstat = NFSERR_BAD_COOKIE; } } } #endif if (!nd->nd_repstat && vp->v_type != VDIR) nd->nd_repstat = NFSERR_NOTDIR; if (!nd->nd_repstat && cnt == 0) nd->nd_repstat = NFSERR_TOOSMALL; if (!nd->nd_repstat) nd->nd_repstat = nfsvno_accchk(vp, VEXEC, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat) { vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0; is_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs") == 0; rbuf = malloc(siz, M_TEMP, M_WAITOK); again: eofflag = 0; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } iv.iov_base = rbuf; iv.iov_len = siz; io.uio_iov = &iv; io.uio_iovcnt = 1; io.uio_offset = (off_t)off; io.uio_resid = siz; io.uio_segflg = UIO_SYSSPACE; io.uio_rw = UIO_READ; io.uio_td = NULL; nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies, &cookies); off = (u_int64_t)io.uio_offset; if (io.uio_resid) siz -= io.uio_resid; getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL); if (!cookies && !nd->nd_repstat) nd->nd_repstat = NFSERR_PERM; if (!nd->nd_repstat) nd->nd_repstat = getret; if (nd->nd_repstat) { vput(vp); if (cookies) free(cookies, M_TEMP); free(rbuf, M_TEMP); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } /* * If nothing read, return eof * rpc reply */ if (siz == 0) { vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED); txdr_hyper(at.na_filerev, tl); tl += 2; *tl++ = newnfs_false; *tl = newnfs_true; free(cookies, M_TEMP); free(rbuf, M_TEMP); goto out; } /* * Check for degenerate cases of nothing useful read. * If so go try again */ cpos = rbuf; cend = rbuf + siz; dp = (struct dirent *)cpos; cookiep = cookies; /* * For some reason FreeBSD's ufs_readdir() chooses to back the * directory offset up to a block boundary, so it is necessary to * skip over the records that precede the requested offset. This * requires the assumption that file offset cookies monotonically * increase. */ while (cpos < cend && ncookies > 0 && (dp->d_fileno == 0 || dp->d_type == DT_WHT || (is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff) || ((nd->nd_flag & ND_NFSV4) && ((dp->d_namlen == 1 && dp->d_name[0] == '.') || (dp->d_namlen==2 && dp->d_name[0]=='.' && dp->d_name[1]=='.'))))) { cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } if (cpos >= cend || ncookies == 0) { siz = fullsiz; toff = off; goto again; } /* * Busy the file system so that the mount point won't go away * and, as such, VFS_VGET() can be used safely. */ mp = vp->v_mount; vfs_ref(mp); NFSVOPUNLOCK(vp); nd->nd_repstat = vfs_busy(mp, 0); vfs_rel(mp); if (nd->nd_repstat != 0) { vrele(vp); free(cookies, M_TEMP); free(rbuf, M_TEMP); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } /* * Check to see if entries in this directory can be safely acquired * via VFS_VGET() or if a switch to VOP_LOOKUP() is required. * ZFS snapshot directories need VOP_LOOKUP(), so that any * automount of the snapshot directory that is required will * be done. * This needs to be done here for NFSv4, since NFSv4 never does * a VFS_VGET() for "." or "..". */ if (is_zfs == 1) { r = VFS_VGET(mp, at.na_fileid, LK_SHARED, &nvp); if (r == EOPNOTSUPP) { usevget = 0; cn.cn_nameiop = LOOKUP; cn.cn_lkflags = LK_SHARED | LK_RETRY; cn.cn_cred = nd->nd_cred; } else if (r == 0) vput(nvp); } /* * If the reply is likely to exceed MCLBYTES and the reply will * not be saved, use ext_pgs mbufs for TLS. * It is difficult to predict how large each entry will be and * how many entries have been read, so just assume the directory * entries grow by a factor of 4 when attributes are included. * For NFSv4.0, we do not know for sure if the reply will * be saved, so do not use ext_pgs mbufs for NFSv4.0. */ if (cnt > MCLBYTES && siz > MCLBYTES / 4 && (nd->nd_flag & (ND_TLS | ND_EXTPG | ND_SAVEREPLY)) == ND_TLS && (nd->nd_flag & (ND_NFSV4 | ND_NFSV41)) != ND_NFSV4) nd->nd_flag |= ND_EXTPG; /* * Save this position, in case there is an error before one entry * is created. */ mb0 = nd->nd_mb; bpos0 = nd->nd_bpos; bextpg0 = nd->nd_bextpg; bextpgsiz0 = nd->nd_bextpgsiz; /* * Fill in the first part of the reply. * dirlen is the reply length in bytes and cannot exceed cnt. * (Include the two booleans at the end of the reply in dirlen now, * so we recognize when we have exceeded cnt.) */ if (nd->nd_flag & ND_NFSV3) { dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED; nfsrv_postopattr(nd, getret, &at); } else { dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED; } NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); txdr_hyper(at.na_filerev, tl); /* * Save this position, in case there is an empty reply needed. */ mb1 = nd->nd_mb; bpos1 = nd->nd_bpos; bextpg1 = nd->nd_bextpg; bextpgsiz1 = nd->nd_bextpgsiz; /* Loop through the records and build reply */ entrycnt = 0; while (cpos < cend && ncookies > 0 && dirlen < cnt) { nlen = dp->d_namlen; if (dp->d_fileno != 0 && dp->d_type != DT_WHT && nlen <= NFS_MAXNAMLEN && ((nd->nd_flag & ND_NFSV3) || nlen > 2 || (nlen==2 && (dp->d_name[0]!='.' || dp->d_name[1]!='.')) || (nlen == 1 && dp->d_name[0] != '.'))) { /* * Save the current position in the reply, in case * this entry exceeds cnt. */ mb1 = nd->nd_mb; bpos1 = nd->nd_bpos; bextpg1 = nd->nd_bextpg; bextpgsiz1 = nd->nd_bextpgsiz; /* * For readdir_and_lookup get the vnode using * the file number. */ nvp = NULL; refp = NULL; r = 0; at_root = 0; needs_unbusy = 0; new_mp = mp; mounted_on_fileno = (uint64_t)dp->d_fileno; if ((nd->nd_flag & ND_NFSV3) || NFSNONZERO_ATTRBIT(&savbits)) { if (nd->nd_flag & ND_NFSV4) refp = nfsv4root_getreferral(NULL, vp, dp->d_fileno); if (refp == NULL) { if (usevget) r = VFS_VGET(mp, dp->d_fileno, LK_SHARED, &nvp); else r = EOPNOTSUPP; if (r == EOPNOTSUPP) { if (usevget) { usevget = 0; cn.cn_nameiop = LOOKUP; cn.cn_lkflags = LK_SHARED | LK_RETRY; cn.cn_cred = nd->nd_cred; } cn.cn_nameptr = dp->d_name; cn.cn_namelen = nlen; cn.cn_flags = ISLASTCN | NOFOLLOW | LOCKLEAF; if (nlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.') cn.cn_flags |= ISDOTDOT; if (NFSVOPLOCK(vp, LK_SHARED) != 0) { nd->nd_repstat = EPERM; break; } if ((vp->v_vflag & VV_ROOT) != 0 && (cn.cn_flags & ISDOTDOT) != 0) { vref(vp); nvp = vp; r = 0; } else { r = VOP_LOOKUP(vp, &nvp, &cn); if (vp != nvp) NFSVOPUNLOCK(vp); } } /* * For NFSv4, check to see if nvp is * a mount point and get the mount * point vnode, as required. */ if (r == 0 && nfsrv_enable_crossmntpt != 0 && (nd->nd_flag & ND_NFSV4) != 0 && nvp->v_type == VDIR && nvp->v_mountedhere != NULL) { new_mp = nvp->v_mountedhere; r = vfs_busy(new_mp, 0); vput(nvp); nvp = NULL; if (r == 0) { r = VFS_ROOT(new_mp, LK_SHARED, &nvp); needs_unbusy = 1; if (r == 0) at_root = 1; } } } /* * If we failed to look up the entry, then it * has become invalid, most likely removed. */ if (r != 0) { if (needs_unbusy) vfs_unbusy(new_mp); goto invalid; } KASSERT(refp != NULL || nvp != NULL, ("%s: undetected lookup error", __func__)); if (refp == NULL && ((nd->nd_flag & ND_NFSV3) || NFSNONZERO_ATTRBIT(&attrbits))) { r = nfsvno_getfh(nvp, &nfh, p); if (!r) r = nfsvno_getattr(nvp, nvap, nd, p, 1, &attrbits); if (r == 0 && is_zfs == 1 && nfsrv_enable_crossmntpt != 0 && (nd->nd_flag & ND_NFSV4) != 0 && nvp->v_type == VDIR && vp->v_mount != nvp->v_mount) { /* * For a ZFS snapshot, there is a * pseudo mount that does not set * v_mountedhere, so it needs to * be detected via a different * mount structure. */ at_root = 1; if (new_mp == mp) new_mp = nvp->v_mount; } } /* * If we failed to get attributes of the entry, * then just skip it for NFSv3 (the traditional * behavior in the old NFS server). * For NFSv4 the behavior is controlled by * RDATTRERROR: we either ignore the error or * fail the request. * Note that RDATTRERROR is never set for NFSv3. */ if (r != 0) { if (!NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_RDATTRERROR)) { vput(nvp); if (needs_unbusy != 0) vfs_unbusy(new_mp); if ((nd->nd_flag & ND_NFSV3)) goto invalid; nd->nd_repstat = r; break; } } } /* * Build the directory record xdr */ if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = newnfs_true; *tl++ = 0; *tl = txdr_unsigned(dp->d_fileno); dirlen += nfsm_strtom(nd, dp->d_name, nlen); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); txdr_hyper(*cookiep, tl); nfsrv_postopattr(nd, 0, nvap); dirlen += nfsm_fhtom(nd,(u_int8_t *)&nfh,0,1); dirlen += (5*NFSX_UNSIGNED+NFSX_V3POSTOPATTR); if (nvp != NULL) vput(nvp); } else { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = newnfs_true; txdr_hyper(*cookiep, tl); dirlen += nfsm_strtom(nd, dp->d_name, nlen); if (nvp != NULL) { supports_nfsv4acls = nfs_supportsnfsv4acls(nvp); NFSVOPUNLOCK(nvp); } else supports_nfsv4acls = 0; if (refp != NULL) { dirlen += nfsrv_putreferralattr(nd, &savbits, refp, 0, &nd->nd_repstat); if (nd->nd_repstat) { if (nvp != NULL) vrele(nvp); if (needs_unbusy != 0) vfs_unbusy(new_mp); break; } } else if (r) { dirlen += nfsvno_fillattr(nd, new_mp, nvp, nvap, &nfh, r, &rderrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, mounted_on_fileno); } else { dirlen += nfsvno_fillattr(nd, new_mp, nvp, nvap, &nfh, r, &attrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, mounted_on_fileno); } if (nvp != NULL) vrele(nvp); dirlen += (3 * NFSX_UNSIGNED); } if (needs_unbusy != 0) vfs_unbusy(new_mp); if (dirlen <= cnt) entrycnt++; } invalid: cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } vrele(vp); vfs_unbusy(mp); /* * If dirlen > cnt, we must strip off the last entry. If that * results in an empty reply, report NFSERR_TOOSMALL. */ if (dirlen > cnt || nd->nd_repstat) { if (!nd->nd_repstat && entrycnt == 0) nd->nd_repstat = NFSERR_TOOSMALL; if (nd->nd_repstat) { nfsm_trimtrailing(nd, mb0, bpos0, bextpg0, bextpgsiz0); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); } else nfsm_trimtrailing(nd, mb1, bpos1, bextpg1, bextpgsiz1); eofflag = 0; } else if (cpos < cend) eofflag = 0; if (!nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = newnfs_false; if (eofflag) *tl = newnfs_true; else *tl = newnfs_false; } free(cookies, M_TEMP); free(rbuf, M_TEMP); out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * Get the settable attributes out of the mbuf list. * (Return 0 or EBADRPC) */ int nfsrv_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p) { u_int32_t *tl; struct nfsv2_sattr *sp; int error = 0, toclient = 0; switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) { case ND_NFSV2: NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR); /* * Some old clients didn't fill in the high order 16bits. * --> check the low order 2 bytes for 0xffff */ if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff) nvap->na_mode = nfstov_mode(sp->sa_mode); if (sp->sa_uid != newnfs_xdrneg1) nvap->na_uid = fxdr_unsigned(uid_t, sp->sa_uid); if (sp->sa_gid != newnfs_xdrneg1) nvap->na_gid = fxdr_unsigned(gid_t, sp->sa_gid); if (sp->sa_size != newnfs_xdrneg1) nvap->na_size = fxdr_unsigned(u_quad_t, sp->sa_size); if (sp->sa_atime.nfsv2_sec != newnfs_xdrneg1) { #ifdef notyet fxdr_nfsv2time(&sp->sa_atime, &nvap->na_atime); #else nvap->na_atime.tv_sec = fxdr_unsigned(u_int32_t,sp->sa_atime.nfsv2_sec); nvap->na_atime.tv_nsec = 0; #endif } if (sp->sa_mtime.nfsv2_sec != newnfs_xdrneg1) fxdr_nfsv2time(&sp->sa_mtime, &nvap->na_mtime); break; case ND_NFSV3: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nvap->na_mode = nfstov_mode(*tl); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nvap->na_uid = fxdr_unsigned(uid_t, *tl); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nvap->na_gid = fxdr_unsigned(gid_t, *tl); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); nvap->na_size = fxdr_hyper(tl); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); switch (fxdr_unsigned(int, *tl)) { case NFSV3SATTRTIME_TOCLIENT: NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); fxdr_nfsv3time(tl, &nvap->na_atime); toclient = 1; break; case NFSV3SATTRTIME_TOSERVER: vfs_timestamp(&nvap->na_atime); nvap->na_vaflags |= VA_UTIMES_NULL; break; } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); switch (fxdr_unsigned(int, *tl)) { case NFSV3SATTRTIME_TOCLIENT: NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); fxdr_nfsv3time(tl, &nvap->na_mtime); nvap->na_vaflags &= ~VA_UTIMES_NULL; break; case NFSV3SATTRTIME_TOSERVER: vfs_timestamp(&nvap->na_mtime); if (!toclient) nvap->na_vaflags |= VA_UTIMES_NULL; break; } break; case ND_NFSV4: error = nfsv4_sattr(nd, vp, nvap, attrbitp, aclp, p); } nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * Handle the setable attributes for V4. * Returns NFSERR_BADXDR if it can't be parsed, 0 otherwise. */ int nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p) { u_int32_t *tl; int attrsum = 0; int i, j; int error, attrsize, bitpos, aclsize, aceerr, retnotsup = 0; int moderet, toclient = 0; u_char *cp, namestr[NFSV4_SMALLSTR + 1]; uid_t uid; gid_t gid; u_short mode, mask; /* Same type as va_mode. */ struct vattr va; error = nfsrv_getattrbits(nd, attrbitp, NULL, &retnotsup); if (error) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsize = fxdr_unsigned(int, *tl); /* * Loop around getting the setable attributes. If an unsupported * one is found, set nd_repstat == NFSERR_ATTRNOTSUPP and return. */ if (retnotsup) { nd->nd_repstat = NFSERR_ATTRNOTSUPP; bitpos = NFSATTRBIT_MAX; } else { bitpos = 0; } moderet = 0; for (; bitpos < NFSATTRBIT_MAX; bitpos++) { if (attrsum > attrsize) { error = NFSERR_BADXDR; goto nfsmout; } if (NFSISSET_ATTRBIT(attrbitp, bitpos)) switch (bitpos) { case NFSATTRBIT_SIZE: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); if (vp != NULL && vp->v_type != VREG) { error = (vp->v_type == VDIR) ? NFSERR_ISDIR : NFSERR_INVAL; goto nfsmout; } nvap->na_size = fxdr_hyper(tl); attrsum += NFSX_HYPER; break; case NFSATTRBIT_ACL: error = nfsrv_dissectacl(nd, aclp, true, &aceerr, &aclsize, p); if (error) goto nfsmout; if (aceerr && !nd->nd_repstat) nd->nd_repstat = aceerr; attrsum += aclsize; break; case NFSATTRBIT_ARCHIVE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_HIDDEN: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_MIMETYPE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); error = nfsm_advance(nd, NFSM_RNDUP(i), -1); if (error) goto nfsmout; if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i)); break; case NFSATTRBIT_MODE: moderet = NFSERR_INVAL; /* Can't do MODESETMASKED. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nvap->na_mode = nfstov_mode(*tl); attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_OWNER: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); j = fxdr_unsigned(int, *tl); if (j < 0) { error = NFSERR_BADXDR; goto nfsmout; } if (j > NFSV4_SMALLSTR) cp = malloc(j + 1, M_NFSSTRING, M_WAITOK); else cp = namestr; error = nfsrv_mtostr(nd, cp, j); if (error) { if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); goto nfsmout; } if (!nd->nd_repstat) { nd->nd_repstat = nfsv4_strtouid(nd, cp, j, &uid); if (!nd->nd_repstat) nvap->na_uid = uid; } if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j)); break; case NFSATTRBIT_OWNERGROUP: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); j = fxdr_unsigned(int, *tl); if (j < 0) { error = NFSERR_BADXDR; goto nfsmout; } if (j > NFSV4_SMALLSTR) cp = malloc(j + 1, M_NFSSTRING, M_WAITOK); else cp = namestr; error = nfsrv_mtostr(nd, cp, j); if (error) { if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); goto nfsmout; } if (!nd->nd_repstat) { nd->nd_repstat = nfsv4_strtogid(nd, cp, j, &gid); if (!nd->nd_repstat) nvap->na_gid = gid; } if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j)); break; case NFSATTRBIT_SYSTEM: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_TIMEACCESSSET: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsum += NFSX_UNSIGNED; if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); fxdr_nfsv4time(tl, &nvap->na_atime); toclient = 1; attrsum += NFSX_V4TIME; } else { vfs_timestamp(&nvap->na_atime); nvap->na_vaflags |= VA_UTIMES_NULL; } break; case NFSATTRBIT_TIMEBACKUP: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMECREATE: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); fxdr_nfsv4time(tl, &nvap->na_btime); attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEMODIFYSET: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsum += NFSX_UNSIGNED; if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); fxdr_nfsv4time(tl, &nvap->na_mtime); nvap->na_vaflags &= ~VA_UTIMES_NULL; attrsum += NFSX_V4TIME; } else { vfs_timestamp(&nvap->na_mtime); if (!toclient) nvap->na_vaflags |= VA_UTIMES_NULL; } break; case NFSATTRBIT_MODESETMASKED: NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); mode = fxdr_unsigned(u_short, *tl++); mask = fxdr_unsigned(u_short, *tl); /* * vp == NULL implies an Open/Create operation. * This attribute can only be used for Setattr and * only for NFSv4.1 or higher. * If moderet != 0, a mode attribute has also been * specified and this attribute cannot be done in the * same Setattr operation. */ if ((nd->nd_flag & ND_NFSV41) == 0) nd->nd_repstat = NFSERR_ATTRNOTSUPP; else if ((mode & ~07777) != 0 || (mask & ~07777) != 0 || vp == NULL) nd->nd_repstat = NFSERR_INVAL; else if (moderet == 0) moderet = VOP_GETATTR(vp, &va, nd->nd_cred); if (moderet == 0) nvap->na_mode = (mode & mask) | (va.va_mode & ~mask); else nd->nd_repstat = moderet; attrsum += 2 * NFSX_UNSIGNED; break; default: nd->nd_repstat = NFSERR_ATTRNOTSUPP; /* * set bitpos so we drop out of the loop. */ bitpos = NFSATTRBIT_MAX; break; } } /* * some clients pad the attrlist, so we need to skip over the * padding. */ if (attrsum > attrsize) { error = NFSERR_BADXDR; } else { attrsize = NFSM_RNDUP(attrsize); if (attrsum < attrsize) error = nfsm_advance(nd, attrsize - attrsum, -1); } nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * Check/setup export credentials. */ int nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp, struct ucred *credanon, bool testsec) { int error; /* * Check/setup credentials. */ if (nd->nd_flag & ND_GSS) exp->nes_exflag &= ~MNT_EXPORTANON; /* * Check to see if the operation is allowed for this security flavor. */ error = 0; if (testsec) { error = nfsvno_testexp(nd, exp); if (error != 0) goto out; } /* * Check to see if the file system is exported V4 only. */ if (NFSVNO_EXV4ONLY(exp) && !(nd->nd_flag & ND_NFSV4)) { error = NFSERR_PROGNOTV4; goto out; } /* * Now, map the user credentials. * (Note that ND_AUTHNONE will only be set for an NFSv3 * Fsinfo RPC. If set for anything else, this code might need * to change.) */ if (NFSVNO_EXPORTED(exp)) { if (((nd->nd_flag & ND_GSS) == 0 && nd->nd_cred->cr_uid == 0) || NFSVNO_EXPORTANON(exp) || (nd->nd_flag & ND_AUTHNONE) != 0) { nd->nd_cred->cr_uid = credanon->cr_uid; nd->nd_cred->cr_gid = credanon->cr_gid; crsetgroups(nd->nd_cred, credanon->cr_ngroups, credanon->cr_groups); } else if ((nd->nd_flag & ND_GSS) == 0) { /* * If using AUTH_SYS, call nfsrv_getgrpscred() to see * if there is a replacement credential with a group * list set up by "nfsuserd -manage-gids". * If there is no replacement, nfsrv_getgrpscred() * simply returns its argument. */ nd->nd_cred = nfsrv_getgrpscred(nd->nd_cred); } } out: NFSEXITCODE2(error, nd); return (error); } /* * Check exports. */ int nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp, struct ucred **credp) { int error; error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp, &exp->nes_numsecflavor, exp->nes_secflavors); if (error) { if (nfs_rootfhset) { exp->nes_exflag = 0; exp->nes_numsecflavor = 0; error = 0; } } else if (exp->nes_numsecflavor < 1 || exp->nes_numsecflavor > MAXSECFLAVORS) { printf("nfsvno_checkexp: numsecflavors out of range\n"); exp->nes_numsecflavor = 0; error = EACCES; } NFSEXITCODE(error); return (error); } /* * Get a vnode for a file handle and export stuff. */ int nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam, int lktype, struct vnode **vpp, struct nfsexstuff *exp, struct ucred **credp) { int error; *credp = NULL; exp->nes_numsecflavor = 0; error = VFS_FHTOVP(mp, &fhp->fh_fid, lktype, vpp); if (error != 0) /* Make sure the server replies ESTALE to the client. */ error = ESTALE; if (nam && !error) { error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp, &exp->nes_numsecflavor, exp->nes_secflavors); if (error) { if (nfs_rootfhset) { exp->nes_exflag = 0; exp->nes_numsecflavor = 0; error = 0; } else { vput(*vpp); } } else if (exp->nes_numsecflavor < 1 || exp->nes_numsecflavor > MAXSECFLAVORS) { printf("nfsvno_fhtovp: numsecflavors out of range\n"); exp->nes_numsecflavor = 0; error = EACCES; vput(*vpp); } } NFSEXITCODE(error); return (error); } /* * nfsd_fhtovp() - convert a fh to a vnode ptr * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling nfsvno_fhtovp() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * for AUTH_SYS * - if mpp != NULL, return the mount point so that it can * be used for vn_finished_write() by the caller */ void nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype, struct vnode **vpp, struct nfsexstuff *exp, struct mount **mpp, int startwrite, int nextop) { struct mount *mp, *mpw; struct ucred *credanon; fhandle_t *fhp; int error; if (mpp != NULL) *mpp = NULL; *vpp = NULL; fhp = (fhandle_t *)nfp->nfsrvfh_data; mp = vfs_busyfs(&fhp->fh_fsid); if (mp == NULL) { nd->nd_repstat = ESTALE; goto out; } if (startwrite) { mpw = mp; error = vn_start_write(NULL, &mpw, V_WAIT); if (error != 0) { mpw = NULL; vfs_unbusy(mp); nd->nd_repstat = ESTALE; goto out; } if (lktype == LK_SHARED && !(MNT_SHARED_WRITES(mp))) lktype = LK_EXCLUSIVE; } else mpw = NULL; nd->nd_repstat = nfsvno_fhtovp(mp, fhp, nd->nd_nam, lktype, vpp, exp, &credanon); vfs_unbusy(mp); /* * For NFSv4 without a pseudo root fs, unexported file handles * can be returned, so that Lookup works everywhere. */ if (!nd->nd_repstat && exp->nes_exflag == 0 && !(nd->nd_flag & ND_NFSV4)) { vput(*vpp); *vpp = NULL; nd->nd_repstat = EACCES; } /* * Personally, I've never seen any point in requiring a * reserved port#, since only in the rare case where the * clients are all boxes with secure system privileges, * does it provide any enhanced security, but... some people * believe it to be useful and keep putting this code back in. * (There is also some "security checker" out there that * complains if the nfs server doesn't enforce this.) * However, note the following: * RFC3530 (NFSv4) specifies that a reserved port# not be * required. * RFC2623 recommends that, if a reserved port# is checked for, * that there be a way to turn that off--> ifdef'd. */ #ifdef NFS_REQRSVPORT if (!nd->nd_repstat) { struct sockaddr_in *saddr; struct sockaddr_in6 *saddr6; saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *); saddr6 = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in6 *); if (!(nd->nd_flag & ND_NFSV4) && ((saddr->sin_family == AF_INET && ntohs(saddr->sin_port) >= IPPORT_RESERVED) || (saddr6->sin6_family == AF_INET6 && ntohs(saddr6->sin6_port) >= IPPORT_RESERVED))) { vput(*vpp); nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK); } } #endif /* NFS_REQRSVPORT */ /* * Check/setup credentials. */ if (!nd->nd_repstat) { nd->nd_saveduid = nd->nd_cred->cr_uid; nd->nd_repstat = nfsd_excred(nd, exp, credanon, nfsrv_checkwrongsec(nd, nextop, (*vpp)->v_type)); if (nd->nd_repstat) vput(*vpp); } if (credanon != NULL) crfree(credanon); if (nd->nd_repstat) { vn_finished_write(mpw); *vpp = NULL; } else if (mpp != NULL) { *mpp = mpw; } out: NFSEXITCODE2(0, nd); } /* * glue for fp. */ static int fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp) { struct filedesc *fdp; struct file *fp; int error = 0; fdp = p->td_proc->p_fd; if (fd < 0 || fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd].fde_file) == NULL) { error = EBADF; goto out; } *fpp = fp; out: NFSEXITCODE(error); return (error); } /* * Called from nfssvc() to update the exports list. Just call * vfs_export(). This has to be done, since the v4 root fake fs isn't * in the mount list. */ int nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p) { struct nfsex_args *nfsexargp = (struct nfsex_args *)argp; int error = 0; struct nameidata nd; fhandle_t fh; error = vfs_export(&nfsv4root_mnt, &nfsexargp->export); if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0) nfs_rootfhset = 0; else if (error == 0) { if (nfsexargp->fspec == NULL) { error = EPERM; goto out; } /* * If fspec != NULL, this is the v4root path. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, nfsexargp->fspec); if ((error = namei(&nd)) != 0) goto out; error = nfsvno_getfh(nd.ni_vp, &fh, p); vrele(nd.ni_vp); if (!error) { nfs_rootfh.nfsrvfh_len = NFSX_MYFH; NFSBCOPY((caddr_t)&fh, nfs_rootfh.nfsrvfh_data, sizeof (fhandle_t)); nfs_rootfhset = 1; } } out: NFSEXITCODE(error); return (error); } /* * This function needs to test to see if the system is near its limit * for memory allocation via malloc() or mget() and return True iff * either of these resources are near their limit. * XXX (For now, this is just a stub.) */ int nfsrv_testmalloclimit = 0; int nfsrv_mallocmget_limit(void) { static int printmesg = 0; static int testval = 1; if (nfsrv_testmalloclimit && (testval++ % 1000) == 0) { if ((printmesg++ % 100) == 0) printf("nfsd: malloc/mget near limit\n"); return (1); } return (0); } /* * BSD specific initialization of a mount point. */ void nfsd_mntinit(void) { static int inited = 0; if (inited) return; inited = 1; nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED); TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist); TAILQ_INIT(&nfsv4root_mnt.mnt_lazyvnodelist); nfsv4root_mnt.mnt_export = NULL; TAILQ_INIT(&nfsv4root_opt); TAILQ_INIT(&nfsv4root_newopt); nfsv4root_mnt.mnt_opt = &nfsv4root_opt; nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt; nfsv4root_mnt.mnt_nvnodelistsize = 0; nfsv4root_mnt.mnt_lazyvnodelistsize = 0; } static void nfsd_timer(void *arg) { nfsrv_servertimer(); callout_reset_sbt(&nfsd_callout, SBT_1S, SBT_1S, nfsd_timer, NULL, 0); } /* * Get a vnode for a file handle, without checking exports, etc. */ struct vnode * nfsvno_getvp(fhandle_t *fhp) { struct mount *mp; struct vnode *vp; int error; mp = vfs_busyfs(&fhp->fh_fsid); if (mp == NULL) return (NULL); error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error) return (NULL); return (vp); } /* * Do a local VOP_ADVLOCK(). */ int nfsvno_advlock(struct vnode *vp, int ftype, u_int64_t first, u_int64_t end, struct thread *td) { int error = 0; struct flock fl; u_int64_t tlen; if (nfsrv_dolocallocks == 0) goto out; ASSERT_VOP_UNLOCKED(vp, "nfsvno_advlock: vp locked"); fl.l_whence = SEEK_SET; fl.l_type = ftype; fl.l_start = (off_t)first; if (end == NFS64BITSSET) { fl.l_len = 0; } else { tlen = end - first; fl.l_len = (off_t)tlen; } /* * For FreeBSD8, the l_pid and l_sysid must be set to the same * values for all calls, so that all locks will be held by the * nfsd server. (The nfsd server handles conflicts between the * various clients.) * Since an NFSv4 lockowner is a ClientID plus an array of up to 1024 * bytes, so it can't be put in l_sysid. */ if (nfsv4_sysid == 0) nfsv4_sysid = nlm_acquire_next_sysid(); fl.l_pid = (pid_t)0; fl.l_sysid = (int)nfsv4_sysid; if (ftype == F_UNLCK) error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_UNLCK, &fl, (F_POSIX | F_REMOTE)); else error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_SETLK, &fl, (F_POSIX | F_REMOTE)); out: NFSEXITCODE(error); return (error); } /* * Check the nfsv4 root exports. */ int nfsvno_v4rootexport(struct nfsrv_descript *nd) { struct ucred *credanon; int error = 0, numsecflavor, secflavors[MAXSECFLAVORS], i; uint64_t exflags; error = vfs_stdcheckexp(&nfsv4root_mnt, nd->nd_nam, &exflags, &credanon, &numsecflavor, secflavors); if (error) { error = NFSERR_PROGUNAVAIL; goto out; } if (credanon != NULL) crfree(credanon); for (i = 0; i < numsecflavor; i++) { if (secflavors[i] == AUTH_SYS) nd->nd_flag |= ND_EXAUTHSYS; else if (secflavors[i] == RPCSEC_GSS_KRB5) nd->nd_flag |= ND_EXGSS; else if (secflavors[i] == RPCSEC_GSS_KRB5I) nd->nd_flag |= ND_EXGSSINTEGRITY; else if (secflavors[i] == RPCSEC_GSS_KRB5P) nd->nd_flag |= ND_EXGSSPRIVACY; } /* And set ND_EXxx flags for TLS. */ if ((exflags & MNT_EXTLS) != 0) { nd->nd_flag |= ND_EXTLS; if ((exflags & MNT_EXTLSCERT) != 0) nd->nd_flag |= ND_EXTLSCERT; if ((exflags & MNT_EXTLSCERTUSER) != 0) nd->nd_flag |= ND_EXTLSCERTUSER; } out: NFSEXITCODE(error); return (error); } /* * Nfs server pseudo system call for the nfsd's */ /* * MPSAFE */ static int nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap) { struct file *fp; struct nfsd_addsock_args sockarg; struct nfsd_nfsd_args nfsdarg; struct nfsd_nfsd_oargs onfsdarg; struct nfsd_pnfsd_args pnfsdarg; struct vnode *vp, *nvp, *curdvp; struct pnfsdsfile *pf; struct nfsdevice *ds, *fds; cap_rights_t rights; int buflen, error, ret; char *buf, *cp, *cp2, *cp3; char fname[PNFS_FILENAME_LEN + 1]; if (uap->flag & NFSSVC_NFSDADDSOCK) { error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg)); if (error) goto out; /* * Since we don't know what rights might be required, * pretend that we need them all. It is better to be too * careful than too reckless. */ error = fget(td, sockarg.sock, cap_rights_init_one(&rights, CAP_SOCK_SERVER), &fp); if (error != 0) goto out; if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); error = EPERM; goto out; } error = nfsrvd_addsock(fp); fdrop(fp, td); } else if (uap->flag & NFSSVC_NFSDNFSD) { if (uap->argp == NULL) { error = EINVAL; goto out; } if ((uap->flag & NFSSVC_NEWSTRUCT) == 0) { error = copyin(uap->argp, &onfsdarg, sizeof(onfsdarg)); if (error == 0) { nfsdarg.principal = onfsdarg.principal; nfsdarg.minthreads = onfsdarg.minthreads; nfsdarg.maxthreads = onfsdarg.maxthreads; nfsdarg.version = 1; nfsdarg.addr = NULL; nfsdarg.addrlen = 0; nfsdarg.dnshost = NULL; nfsdarg.dnshostlen = 0; nfsdarg.dspath = NULL; nfsdarg.dspathlen = 0; nfsdarg.mdspath = NULL; nfsdarg.mdspathlen = 0; nfsdarg.mirrorcnt = 1; } } else error = copyin(uap->argp, &nfsdarg, sizeof(nfsdarg)); if (error) goto out; if (nfsdarg.addrlen > 0 && nfsdarg.addrlen < 10000 && nfsdarg.dnshostlen > 0 && nfsdarg.dnshostlen < 10000 && nfsdarg.dspathlen > 0 && nfsdarg.dspathlen < 10000 && nfsdarg.mdspathlen > 0 && nfsdarg.mdspathlen < 10000 && nfsdarg.mirrorcnt >= 1 && nfsdarg.mirrorcnt <= NFSDEV_MAXMIRRORS && nfsdarg.addr != NULL && nfsdarg.dnshost != NULL && nfsdarg.dspath != NULL && nfsdarg.mdspath != NULL) { NFSD_DEBUG(1, "addrlen=%d dspathlen=%d dnslen=%d" " mdspathlen=%d mirrorcnt=%d\n", nfsdarg.addrlen, nfsdarg.dspathlen, nfsdarg.dnshostlen, nfsdarg.mdspathlen, nfsdarg.mirrorcnt); cp = malloc(nfsdarg.addrlen + 1, M_TEMP, M_WAITOK); error = copyin(nfsdarg.addr, cp, nfsdarg.addrlen); if (error != 0) { free(cp, M_TEMP); goto out; } cp[nfsdarg.addrlen] = '\0'; /* Ensure nul term. */ nfsdarg.addr = cp; cp = malloc(nfsdarg.dnshostlen + 1, M_TEMP, M_WAITOK); error = copyin(nfsdarg.dnshost, cp, nfsdarg.dnshostlen); if (error != 0) { free(nfsdarg.addr, M_TEMP); free(cp, M_TEMP); goto out; } cp[nfsdarg.dnshostlen] = '\0'; /* Ensure nul term. */ nfsdarg.dnshost = cp; cp = malloc(nfsdarg.dspathlen + 1, M_TEMP, M_WAITOK); error = copyin(nfsdarg.dspath, cp, nfsdarg.dspathlen); if (error != 0) { free(nfsdarg.addr, M_TEMP); free(nfsdarg.dnshost, M_TEMP); free(cp, M_TEMP); goto out; } cp[nfsdarg.dspathlen] = '\0'; /* Ensure nul term. */ nfsdarg.dspath = cp; cp = malloc(nfsdarg.mdspathlen + 1, M_TEMP, M_WAITOK); error = copyin(nfsdarg.mdspath, cp, nfsdarg.mdspathlen); if (error != 0) { free(nfsdarg.addr, M_TEMP); free(nfsdarg.dnshost, M_TEMP); free(nfsdarg.dspath, M_TEMP); free(cp, M_TEMP); goto out; } cp[nfsdarg.mdspathlen] = '\0'; /* Ensure nul term. */ nfsdarg.mdspath = cp; } else { nfsdarg.addr = NULL; nfsdarg.addrlen = 0; nfsdarg.dnshost = NULL; nfsdarg.dnshostlen = 0; nfsdarg.dspath = NULL; nfsdarg.dspathlen = 0; nfsdarg.mdspath = NULL; nfsdarg.mdspathlen = 0; nfsdarg.mirrorcnt = 1; } nfsd_timer(NULL); error = nfsrvd_nfsd(td, &nfsdarg); free(nfsdarg.addr, M_TEMP); free(nfsdarg.dnshost, M_TEMP); free(nfsdarg.dspath, M_TEMP); free(nfsdarg.mdspath, M_TEMP); } else if (uap->flag & NFSSVC_PNFSDS) { error = copyin(uap->argp, &pnfsdarg, sizeof(pnfsdarg)); if (error == 0 && (pnfsdarg.op == PNFSDOP_DELDSSERVER || pnfsdarg.op == PNFSDOP_FORCEDELDS)) { cp = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK); error = copyinstr(pnfsdarg.dspath, cp, PATH_MAX + 1, NULL); if (error == 0) error = nfsrv_deldsserver(pnfsdarg.op, cp, td); free(cp, M_TEMP); } else if (error == 0 && pnfsdarg.op == PNFSDOP_COPYMR) { cp = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK); buflen = sizeof(*pf) * NFSDEV_MAXMIRRORS; buf = malloc(buflen, M_TEMP, M_WAITOK); error = copyinstr(pnfsdarg.mdspath, cp, PATH_MAX + 1, NULL); NFSD_DEBUG(4, "pnfsdcopymr cp mdspath=%d\n", error); if (error == 0 && pnfsdarg.dspath != NULL) { cp2 = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK); error = copyinstr(pnfsdarg.dspath, cp2, PATH_MAX + 1, NULL); NFSD_DEBUG(4, "pnfsdcopymr cp dspath=%d\n", error); } else cp2 = NULL; if (error == 0 && pnfsdarg.curdspath != NULL) { cp3 = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK); error = copyinstr(pnfsdarg.curdspath, cp3, PATH_MAX + 1, NULL); NFSD_DEBUG(4, "pnfsdcopymr cp curdspath=%d\n", error); } else cp3 = NULL; curdvp = NULL; fds = NULL; if (error == 0) error = nfsrv_mdscopymr(cp, cp2, cp3, buf, &buflen, fname, td, &vp, &nvp, &pf, &ds, &fds); NFSD_DEBUG(4, "nfsrv_mdscopymr=%d\n", error); if (error == 0) { if (pf->dsf_dir >= nfsrv_dsdirsize) { printf("copymr: dsdir out of range\n"); pf->dsf_dir = 0; } NFSD_DEBUG(4, "copymr: buflen=%d\n", buflen); error = nfsrv_copymr(vp, nvp, ds->nfsdev_dsdir[pf->dsf_dir], ds, pf, (struct pnfsdsfile *)buf, buflen / sizeof(*pf), td->td_ucred, td); vput(vp); vput(nvp); if (fds != NULL && error == 0) { curdvp = fds->nfsdev_dsdir[pf->dsf_dir]; ret = vn_lock(curdvp, LK_EXCLUSIVE); if (ret == 0) { nfsrv_dsremove(curdvp, fname, td->td_ucred, td); NFSVOPUNLOCK(curdvp); } } NFSD_DEBUG(4, "nfsrv_copymr=%d\n", error); } free(cp, M_TEMP); free(cp2, M_TEMP); free(cp3, M_TEMP); free(buf, M_TEMP); } } else { error = nfssvc_srvcall(td, uap, td->td_ucred); } out: NFSEXITCODE(error); return (error); } static int nfssvc_srvcall(struct thread *p, struct nfssvc_args *uap, struct ucred *cred) { struct nfsex_args export; struct nfsex_oldargs oexp; struct file *fp = NULL; int stablefd, i, len; struct nfsd_clid adminrevoke; struct nfsd_dumplist dumplist; struct nfsd_dumpclients *dumpclients; struct nfsd_dumplocklist dumplocklist; struct nfsd_dumplocks *dumplocks; struct nameidata nd; vnode_t vp; int error = EINVAL, igotlock; struct proc *procp; gid_t *grps; static int suspend_nfsd = 0; if (uap->flag & NFSSVC_PUBLICFH) { NFSBZERO((caddr_t)&nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t)); error = copyin(uap->argp, &nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t)); if (!error) nfs_pubfhset = 1; } else if ((uap->flag & (NFSSVC_V4ROOTEXPORT | NFSSVC_NEWSTRUCT)) == (NFSSVC_V4ROOTEXPORT | NFSSVC_NEWSTRUCT)) { error = copyin(uap->argp,(caddr_t)&export, sizeof (struct nfsex_args)); if (!error) { grps = NULL; if (export.export.ex_ngroups > NGROUPS_MAX || export.export.ex_ngroups < 0) error = EINVAL; else if (export.export.ex_ngroups > 0) { grps = malloc(export.export.ex_ngroups * sizeof(gid_t), M_TEMP, M_WAITOK); error = copyin(export.export.ex_groups, grps, export.export.ex_ngroups * sizeof(gid_t)); export.export.ex_groups = grps; } else export.export.ex_groups = NULL; if (!error) error = nfsrv_v4rootexport(&export, cred, p); free(grps, M_TEMP); } } else if ((uap->flag & (NFSSVC_V4ROOTEXPORT | NFSSVC_NEWSTRUCT)) == NFSSVC_V4ROOTEXPORT) { error = copyin(uap->argp,(caddr_t)&oexp, sizeof (struct nfsex_oldargs)); if (!error) { memset(&export.export, 0, sizeof(export.export)); export.export.ex_flags = (uint64_t)oexp.export.ex_flags; export.export.ex_root = oexp.export.ex_root; export.export.ex_uid = oexp.export.ex_anon.cr_uid; export.export.ex_ngroups = oexp.export.ex_anon.cr_ngroups; export.export.ex_groups = NULL; if (export.export.ex_ngroups > XU_NGROUPS || export.export.ex_ngroups < 0) error = EINVAL; else if (export.export.ex_ngroups > 0) { export.export.ex_groups = malloc( export.export.ex_ngroups * sizeof(gid_t), M_TEMP, M_WAITOK); for (i = 0; i < export.export.ex_ngroups; i++) export.export.ex_groups[i] = oexp.export.ex_anon.cr_groups[i]; } export.export.ex_addr = oexp.export.ex_addr; export.export.ex_addrlen = oexp.export.ex_addrlen; export.export.ex_mask = oexp.export.ex_mask; export.export.ex_masklen = oexp.export.ex_masklen; export.export.ex_indexfile = oexp.export.ex_indexfile; export.export.ex_numsecflavors = oexp.export.ex_numsecflavors; if (export.export.ex_numsecflavors >= MAXSECFLAVORS || export.export.ex_numsecflavors < 0) error = EINVAL; else { for (i = 0; i < export.export.ex_numsecflavors; i++) export.export.ex_secflavors[i] = oexp.export.ex_secflavors[i]; } export.fspec = oexp.fspec; if (error == 0) error = nfsrv_v4rootexport(&export, cred, p); free(export.export.ex_groups, M_TEMP); } } else if (uap->flag & NFSSVC_NOPUBLICFH) { nfs_pubfhset = 0; error = 0; } else if (uap->flag & NFSSVC_STABLERESTART) { error = copyin(uap->argp, (caddr_t)&stablefd, sizeof (int)); if (!error) error = fp_getfvp(p, stablefd, &fp, &vp); if (!error && (NFSFPFLAG(fp) & (FREAD | FWRITE)) != (FREAD | FWRITE)) error = EBADF; if (!error && newnfs_numnfsd != 0) error = EPERM; if (!error) { nfsrv_stablefirst.nsf_fp = fp; nfsrv_setupstable(p); } } else if (uap->flag & NFSSVC_ADMINREVOKE) { error = copyin(uap->argp, (caddr_t)&adminrevoke, sizeof (struct nfsd_clid)); if (!error) error = nfsrv_adminrevoke(&adminrevoke, p); } else if (uap->flag & NFSSVC_DUMPCLIENTS) { error = copyin(uap->argp, (caddr_t)&dumplist, sizeof (struct nfsd_dumplist)); if (!error && (dumplist.ndl_size < 1 || dumplist.ndl_size > NFSRV_MAXDUMPLIST)) error = EPERM; if (!error) { len = sizeof (struct nfsd_dumpclients) * dumplist.ndl_size; dumpclients = malloc(len, M_TEMP, M_WAITOK | M_ZERO); nfsrv_dumpclients(dumpclients, dumplist.ndl_size); error = copyout(dumpclients, dumplist.ndl_list, len); free(dumpclients, M_TEMP); } } else if (uap->flag & NFSSVC_DUMPLOCKS) { error = copyin(uap->argp, (caddr_t)&dumplocklist, sizeof (struct nfsd_dumplocklist)); if (!error && (dumplocklist.ndllck_size < 1 || dumplocklist.ndllck_size > NFSRV_MAXDUMPLIST)) error = EPERM; if (!error) error = nfsrv_lookupfilename(&nd, dumplocklist.ndllck_fname, p); if (!error) { len = sizeof (struct nfsd_dumplocks) * dumplocklist.ndllck_size; dumplocks = malloc(len, M_TEMP, M_WAITOK | M_ZERO); nfsrv_dumplocks(nd.ni_vp, dumplocks, dumplocklist.ndllck_size, p); vput(nd.ni_vp); error = copyout(dumplocks, dumplocklist.ndllck_list, len); free(dumplocks, M_TEMP); } } else if (uap->flag & NFSSVC_BACKUPSTABLE) { procp = p->td_proc; PROC_LOCK(procp); nfsd_master_pid = procp->p_pid; bcopy(procp->p_comm, nfsd_master_comm, MAXCOMLEN + 1); nfsd_master_start = procp->p_stats->p_start; nfsd_master_proc = procp; PROC_UNLOCK(procp); } else if ((uap->flag & NFSSVC_SUSPENDNFSD) != 0) { NFSLOCKV4ROOTMUTEX(); if (suspend_nfsd == 0) { /* Lock out all nfsd threads */ do { igotlock = nfsv4_lock(&nfsd_suspend_lock, 1, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL); } while (igotlock == 0 && suspend_nfsd == 0); suspend_nfsd = 1; } NFSUNLOCKV4ROOTMUTEX(); error = 0; } else if ((uap->flag & NFSSVC_RESUMENFSD) != 0) { NFSLOCKV4ROOTMUTEX(); if (suspend_nfsd != 0) { nfsv4_unlock(&nfsd_suspend_lock, 0); suspend_nfsd = 0; } NFSUNLOCKV4ROOTMUTEX(); error = 0; } NFSEXITCODE(error); return (error); } /* * Check exports. * Returns 0 if ok, 1 otherwise. */ int nfsvno_testexp(struct nfsrv_descript *nd, struct nfsexstuff *exp) { int i; if ((NFSVNO_EXTLS(exp) && (nd->nd_flag & ND_TLS) == 0) || (NFSVNO_EXTLSCERT(exp) && (nd->nd_flag & ND_TLSCERT) == 0) || (NFSVNO_EXTLSCERTUSER(exp) && (nd->nd_flag & ND_TLSCERTUSER) == 0)) { if ((nd->nd_flag & ND_NFSV4) != 0) return (NFSERR_WRONGSEC); #ifdef notnow /* There is currently no auth_stat for this. */ else if ((nd->nd_flag & ND_TLS) == 0) return (NFSERR_AUTHERR | AUTH_NEEDS_TLS); else return (NFSERR_AUTHERR | AUTH_NEEDS_TLS_MUTUAL_HOST); #endif else return (NFSERR_AUTHERR | AUTH_TOOWEAK); } /* * RFC2623 suggests that the NFSv3 Fsinfo RPC be allowed to use * AUTH_NONE or AUTH_SYS for file systems requiring RPCSEC_GSS. */ if ((nd->nd_flag & ND_NFSV3) != 0 && nd->nd_procnum == NFSPROC_FSINFO) return (0); /* * This seems odd, but allow the case where the security flavor * list is empty. This happens when NFSv4 is traversing non-exported * file systems. Exported file systems should always have a non-empty * security flavor list. */ if (exp->nes_numsecflavor == 0) return (0); for (i = 0; i < exp->nes_numsecflavor; i++) { /* * The tests for privacy and integrity must be first, * since ND_GSS is set for everything but AUTH_SYS. */ if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5P && (nd->nd_flag & ND_GSSPRIVACY)) return (0); if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5I && (nd->nd_flag & ND_GSSINTEGRITY)) return (0); if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5 && (nd->nd_flag & ND_GSS)) return (0); if (exp->nes_secflavors[i] == AUTH_SYS && (nd->nd_flag & ND_GSS) == 0) return (0); } if ((nd->nd_flag & ND_NFSV4) != 0) return (NFSERR_WRONGSEC); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } /* * Calculate a hash value for the fid in a file handle. */ uint32_t nfsrv_hashfh(fhandle_t *fhp) { uint32_t hashval; hashval = hash32_buf(&fhp->fh_fid, sizeof(struct fid), 0); return (hashval); } /* * Calculate a hash value for the sessionid. */ uint32_t nfsrv_hashsessionid(uint8_t *sessionid) { uint32_t hashval; hashval = hash32_buf(sessionid, NFSX_V4SESSIONID, 0); return (hashval); } /* * Signal the userland master nfsd to backup the stable restart file. */ void nfsrv_backupstable(void) { struct proc *procp; if (nfsd_master_proc != NULL) { procp = pfind(nfsd_master_pid); /* Try to make sure it is the correct process. */ if (procp == nfsd_master_proc && procp->p_stats->p_start.tv_sec == nfsd_master_start.tv_sec && procp->p_stats->p_start.tv_usec == nfsd_master_start.tv_usec && strcmp(procp->p_comm, nfsd_master_comm) == 0) kern_psignal(procp, SIGUSR2); else nfsd_master_proc = NULL; if (procp != NULL) PROC_UNLOCK(procp); } } /* * Create a DS data file for nfsrv_pnfscreate(). Called for each mirror. * The arguments are in a structure, so that they can be passed through * taskqueue for a kernel process to execute this function. */ struct nfsrvdscreate { int done; int inprog; struct task tsk; struct ucred *tcred; struct vnode *dvp; NFSPROC_T *p; struct pnfsdsfile *pf; int err; fhandle_t fh; struct vattr va; struct vattr createva; }; int nfsrv_dscreate(struct vnode *dvp, struct vattr *vap, struct vattr *nvap, fhandle_t *fhp, struct pnfsdsfile *pf, struct pnfsdsattr *dsa, char *fnamep, struct ucred *tcred, NFSPROC_T *p, struct vnode **nvpp) { struct vnode *nvp; struct nameidata named; struct vattr va; char *bufp; u_long *hashp; struct nfsnode *np; struct nfsmount *nmp; int error; NFSNAMEICNDSET(&named.ni_cnd, tcred, CREATE, LOCKPARENT | LOCKLEAF | SAVESTART | NOCACHE); nfsvno_setpathbuf(&named, &bufp, &hashp); named.ni_cnd.cn_lkflags = LK_EXCLUSIVE; named.ni_cnd.cn_nameptr = bufp; if (fnamep != NULL) { strlcpy(bufp, fnamep, PNFS_FILENAME_LEN + 1); named.ni_cnd.cn_namelen = strlen(bufp); } else named.ni_cnd.cn_namelen = nfsrv_putfhname(fhp, bufp); NFSD_DEBUG(4, "nfsrv_dscreate: dvp=%p fname=%s\n", dvp, bufp); /* Create the date file in the DS mount. */ error = NFSVOPLOCK(dvp, LK_EXCLUSIVE); if (error == 0) { error = VOP_CREATE(dvp, &nvp, &named.ni_cnd, vap); vref(dvp); VOP_VPUT_PAIR(dvp, error == 0 ? &nvp : NULL, false); if (error == 0) { /* Set the ownership of the file. */ error = VOP_SETATTR(nvp, nvap, tcred); NFSD_DEBUG(4, "nfsrv_dscreate:" " setattr-uid=%d\n", error); if (error != 0) vput(nvp); } if (error != 0) printf("pNFS: pnfscreate failed=%d\n", error); } else printf("pNFS: pnfscreate vnlock=%d\n", error); if (error == 0) { np = VTONFS(nvp); nmp = VFSTONFS(nvp->v_mount); if (strcmp(nvp->v_mount->mnt_vfc->vfc_name, "nfs") != 0 || nmp->nm_nam->sa_len > sizeof( struct sockaddr_in6) || np->n_fhp->nfh_len != NFSX_MYFH) { printf("Bad DS file: fstype=%s salen=%d" " fhlen=%d\n", nvp->v_mount->mnt_vfc->vfc_name, nmp->nm_nam->sa_len, np->n_fhp->nfh_len); error = ENOENT; } /* Set extattrs for the DS on the MDS file. */ if (error == 0) { if (dsa != NULL) { error = VOP_GETATTR(nvp, &va, tcred); if (error == 0) { dsa->dsa_filerev = va.va_filerev; dsa->dsa_size = va.va_size; dsa->dsa_atime = va.va_atime; dsa->dsa_mtime = va.va_mtime; dsa->dsa_bytes = va.va_bytes; } } if (error == 0) { NFSBCOPY(np->n_fhp->nfh_fh, &pf->dsf_fh, NFSX_MYFH); NFSBCOPY(nmp->nm_nam, &pf->dsf_sin, nmp->nm_nam->sa_len); NFSBCOPY(named.ni_cnd.cn_nameptr, pf->dsf_filename, sizeof(pf->dsf_filename)); } } else printf("pNFS: pnfscreate can't get DS" " attr=%d\n", error); if (nvpp != NULL && error == 0) *nvpp = nvp; else vput(nvp); } nfsvno_relpathbuf(&named); return (error); } /* * Start up the thread that will execute nfsrv_dscreate(). */ static void start_dscreate(void *arg, int pending) { struct nfsrvdscreate *dsc; dsc = (struct nfsrvdscreate *)arg; dsc->err = nfsrv_dscreate(dsc->dvp, &dsc->createva, &dsc->va, &dsc->fh, dsc->pf, NULL, NULL, dsc->tcred, dsc->p, NULL); dsc->done = 1; NFSD_DEBUG(4, "start_dscreate: err=%d\n", dsc->err); } /* * Create a pNFS data file on the Data Server(s). */ static void nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, NFSPROC_T *p) { struct nfsrvdscreate *dsc, *tdsc = NULL; struct nfsdevice *ds, *tds, *fds; struct mount *mp; struct pnfsdsfile *pf, *tpf; struct pnfsdsattr dsattr; struct vattr va; struct vnode *dvp[NFSDEV_MAXMIRRORS]; struct nfsmount *nmp; fhandle_t fh; uid_t vauid; gid_t vagid; u_short vamode; struct ucred *tcred; int dsdir[NFSDEV_MAXMIRRORS], error, i, mirrorcnt, ret; int failpos, timo; /* Get a DS server directory in a round-robin order. */ mirrorcnt = 1; mp = vp->v_mount; ds = fds = NULL; NFSDDSLOCK(); /* * Search for the first entry that handles this MDS fs, but use the * first entry for all MDS fs's otherwise. */ TAILQ_FOREACH(tds, &nfsrv_devidhead, nfsdev_list) { if (tds->nfsdev_nmp != NULL) { if (tds->nfsdev_mdsisset == 0 && ds == NULL) ds = tds; else if (tds->nfsdev_mdsisset != 0 && fsidcmp( &mp->mnt_stat.f_fsid, &tds->nfsdev_mdsfsid) == 0) { ds = fds = tds; break; } } } if (ds == NULL) { NFSDDSUNLOCK(); NFSD_DEBUG(4, "nfsrv_pnfscreate: no srv\n"); return; } i = dsdir[0] = ds->nfsdev_nextdir; ds->nfsdev_nextdir = (ds->nfsdev_nextdir + 1) % nfsrv_dsdirsize; dvp[0] = ds->nfsdev_dsdir[i]; tds = TAILQ_NEXT(ds, nfsdev_list); if (nfsrv_maxpnfsmirror > 1 && tds != NULL) { TAILQ_FOREACH_FROM(tds, &nfsrv_devidhead, nfsdev_list) { if (tds->nfsdev_nmp != NULL && ((tds->nfsdev_mdsisset == 0 && fds == NULL) || (tds->nfsdev_mdsisset != 0 && fds != NULL && fsidcmp(&mp->mnt_stat.f_fsid, &tds->nfsdev_mdsfsid) == 0))) { dsdir[mirrorcnt] = i; dvp[mirrorcnt] = tds->nfsdev_dsdir[i]; mirrorcnt++; if (mirrorcnt >= nfsrv_maxpnfsmirror) break; } } } /* Put at end of list to implement round-robin usage. */ TAILQ_REMOVE(&nfsrv_devidhead, ds, nfsdev_list); TAILQ_INSERT_TAIL(&nfsrv_devidhead, ds, nfsdev_list); NFSDDSUNLOCK(); dsc = NULL; if (mirrorcnt > 1) tdsc = dsc = malloc(sizeof(*dsc) * (mirrorcnt - 1), M_TEMP, M_WAITOK | M_ZERO); tpf = pf = malloc(sizeof(*pf) * nfsrv_maxpnfsmirror, M_TEMP, M_WAITOK | M_ZERO); error = nfsvno_getfh(vp, &fh, p); if (error == 0) error = VOP_GETATTR(vp, &va, cred); if (error == 0) { /* Set the attributes for "vp" to Setattr the DS vp. */ vauid = va.va_uid; vagid = va.va_gid; vamode = va.va_mode; VATTR_NULL(&va); va.va_uid = vauid; va.va_gid = vagid; va.va_mode = vamode; va.va_size = 0; } else printf("pNFS: pnfscreate getfh+attr=%d\n", error); NFSD_DEBUG(4, "nfsrv_pnfscreate: cruid=%d crgid=%d\n", cred->cr_uid, cred->cr_gid); /* Make data file name based on FH. */ tcred = newnfs_getcred(); /* * Create the file on each DS mirror, using kernel process(es) for the * additional mirrors. */ failpos = -1; for (i = 0; i < mirrorcnt - 1 && error == 0; i++, tpf++, tdsc++) { tpf->dsf_dir = dsdir[i]; tdsc->tcred = tcred; tdsc->p = p; tdsc->pf = tpf; tdsc->createva = *vap; NFSBCOPY(&fh, &tdsc->fh, sizeof(fh)); tdsc->va = va; tdsc->dvp = dvp[i]; tdsc->done = 0; tdsc->inprog = 0; tdsc->err = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_dscreate, tdsc); NFSD_DEBUG(4, "nfsrv_pnfscreate: nfs_pnfsio=%d\n", ret); } if (ret != 0) { ret = nfsrv_dscreate(dvp[i], vap, &va, &fh, tpf, NULL, NULL, tcred, p, NULL); if (ret != 0) { KASSERT(error == 0, ("nfsrv_dscreate err=%d", error)); if (failpos == -1 && nfsds_failerr(ret)) failpos = i; else error = ret; } } } if (error == 0) { tpf->dsf_dir = dsdir[mirrorcnt - 1]; error = nfsrv_dscreate(dvp[mirrorcnt - 1], vap, &va, &fh, tpf, &dsattr, NULL, tcred, p, NULL); if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(error)) { failpos = mirrorcnt - 1; error = 0; } } timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; /* Wait for kernel task(s) to complete. */ for (tdsc = dsc, i = 0; i < mirrorcnt - 1; i++, tdsc++) { while (tdsc->inprog != 0 && tdsc->done == 0) tsleep(&tdsc->tsk, PVFS, "srvdcr", timo); if (tdsc->err != 0) { if (failpos == -1 && nfsds_failerr(tdsc->err)) failpos = i; else if (error == 0) error = tdsc->err; } } /* * If failpos has been set, that mirror has failed, so it needs * to be disabled. */ if (failpos >= 0) { nmp = VFSTONFS(dvp[failpos]->v_mount); NFSLOCKMNT(nmp); if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM | NFSMNTP_CANCELRPCS)) == 0) { nmp->nm_privflag |= NFSMNTP_CANCELRPCS; NFSUNLOCKMNT(nmp); ds = nfsrv_deldsnmp(PNFSDOP_DELDSSERVER, nmp, p); NFSD_DEBUG(4, "dscreatfail fail=%d ds=%p\n", failpos, ds); if (ds != NULL) nfsrv_killrpcs(nmp); NFSLOCKMNT(nmp); nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS; wakeup(nmp); } NFSUNLOCKMNT(nmp); } NFSFREECRED(tcred); if (error == 0) { ASSERT_VOP_ELOCKED(vp, "nfsrv_pnfscreate vp"); NFSD_DEBUG(4, "nfsrv_pnfscreate: mirrorcnt=%d maxmirror=%d\n", mirrorcnt, nfsrv_maxpnfsmirror); /* * For all mirrors that couldn't be created, fill in the * *pf structure, but with an IP address == 0.0.0.0. */ tpf = pf + mirrorcnt; for (i = mirrorcnt; i < nfsrv_maxpnfsmirror; i++, tpf++) { *tpf = *pf; tpf->dsf_sin.sin_family = AF_INET; tpf->dsf_sin.sin_len = sizeof(struct sockaddr_in); tpf->dsf_sin.sin_addr.s_addr = 0; tpf->dsf_sin.sin_port = 0; } error = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile", sizeof(*pf) * nfsrv_maxpnfsmirror, (char *)pf, p); if (error == 0) error = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsattr", sizeof(dsattr), (char *)&dsattr, p); if (error != 0) printf("pNFS: pnfscreate setextattr=%d\n", error); } else printf("pNFS: pnfscreate=%d\n", error); free(pf, M_TEMP); free(dsc, M_TEMP); } /* * Get the information needed to remove the pNFS Data Server file from the * Metadata file. Upon success, ddvp is set non-NULL to the locked * DS directory vnode. The caller must unlock *ddvp when done with it. */ static void nfsrv_pnfsremovesetup(struct vnode *vp, NFSPROC_T *p, struct vnode **dvpp, int *mirrorcntp, char *fname, fhandle_t *fhp) { struct vattr va; struct ucred *tcred; char *buf; int buflen, error; dvpp[0] = NULL; /* If not an exported regular file or not a pNFS server, just return. */ if (vp->v_type != VREG || (vp->v_mount->mnt_flag & MNT_EXPORTED) == 0 || nfsrv_devidcnt == 0) return; /* Check to see if this is the last hard link. */ tcred = newnfs_getcred(); error = VOP_GETATTR(vp, &va, tcred); NFSFREECRED(tcred); if (error != 0) { printf("pNFS: nfsrv_pnfsremovesetup getattr=%d\n", error); return; } if (va.va_nlink > 1) return; error = nfsvno_getfh(vp, fhp, p); if (error != 0) { printf("pNFS: nfsrv_pnfsremovesetup getfh=%d\n", error); return; } buflen = 1024; buf = malloc(buflen, M_TEMP, M_WAITOK); /* Get the directory vnode for the DS mount and the file handle. */ error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, mirrorcntp, p, dvpp, NULL, NULL, fname, NULL, NULL, NULL, NULL, NULL); free(buf, M_TEMP); if (error != 0) printf("pNFS: nfsrv_pnfsremovesetup getsockmnt=%d\n", error); } /* * Remove a DS data file for nfsrv_pnfsremove(). Called for each mirror. * The arguments are in a structure, so that they can be passed through * taskqueue for a kernel process to execute this function. */ struct nfsrvdsremove { int done; int inprog; struct task tsk; struct ucred *tcred; struct vnode *dvp; NFSPROC_T *p; int err; char fname[PNFS_FILENAME_LEN + 1]; }; static int nfsrv_dsremove(struct vnode *dvp, char *fname, struct ucred *tcred, NFSPROC_T *p) { struct nameidata named; struct vnode *nvp; char *bufp; u_long *hashp; int error; error = NFSVOPLOCK(dvp, LK_EXCLUSIVE); if (error != 0) return (error); named.ni_cnd.cn_nameiop = DELETE; named.ni_cnd.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; named.ni_cnd.cn_cred = tcred; - named.ni_cnd.cn_flags = ISLASTCN | LOCKPARENT | LOCKLEAF | SAVENAME; + named.ni_cnd.cn_flags = ISLASTCN | LOCKPARENT | LOCKLEAF; nfsvno_setpathbuf(&named, &bufp, &hashp); named.ni_cnd.cn_nameptr = bufp; named.ni_cnd.cn_namelen = strlen(fname); strlcpy(bufp, fname, NAME_MAX); NFSD_DEBUG(4, "nfsrv_pnfsremove: filename=%s\n", bufp); error = VOP_LOOKUP(dvp, &nvp, &named.ni_cnd); NFSD_DEBUG(4, "nfsrv_pnfsremove: aft LOOKUP=%d\n", error); if (error == 0) { error = VOP_REMOVE(dvp, nvp, &named.ni_cnd); vput(nvp); } NFSVOPUNLOCK(dvp); nfsvno_relpathbuf(&named); if (error != 0) printf("pNFS: nfsrv_pnfsremove failed=%d\n", error); return (error); } /* * Start up the thread that will execute nfsrv_dsremove(). */ static void start_dsremove(void *arg, int pending) { struct nfsrvdsremove *dsrm; dsrm = (struct nfsrvdsremove *)arg; dsrm->err = nfsrv_dsremove(dsrm->dvp, dsrm->fname, dsrm->tcred, dsrm->p); dsrm->done = 1; NFSD_DEBUG(4, "start_dsremove: err=%d\n", dsrm->err); } /* * Remove a pNFS data file from a Data Server. * nfsrv_pnfsremovesetup() must have been called before the MDS file was * removed to set up the dvp and fill in the FH. */ static void nfsrv_pnfsremove(struct vnode **dvp, int mirrorcnt, char *fname, fhandle_t *fhp, NFSPROC_T *p) { struct ucred *tcred; struct nfsrvdsremove *dsrm, *tdsrm; struct nfsdevice *ds; struct nfsmount *nmp; int failpos, i, ret, timo; tcred = newnfs_getcred(); dsrm = NULL; if (mirrorcnt > 1) dsrm = malloc(sizeof(*dsrm) * mirrorcnt - 1, M_TEMP, M_WAITOK); /* * Remove the file on each DS mirror, using kernel process(es) for the * additional mirrors. */ failpos = -1; for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) { tdsrm->tcred = tcred; tdsrm->p = p; tdsrm->dvp = dvp[i]; strlcpy(tdsrm->fname, fname, PNFS_FILENAME_LEN + 1); tdsrm->inprog = 0; tdsrm->done = 0; tdsrm->err = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_dsremove, tdsrm); NFSD_DEBUG(4, "nfsrv_pnfsremove: nfs_pnfsio=%d\n", ret); } if (ret != 0) { ret = nfsrv_dsremove(dvp[i], fname, tcred, p); if (failpos == -1 && nfsds_failerr(ret)) failpos = i; } } ret = nfsrv_dsremove(dvp[mirrorcnt - 1], fname, tcred, p); if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(ret)) failpos = mirrorcnt - 1; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; /* Wait for kernel task(s) to complete. */ for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) { while (tdsrm->inprog != 0 && tdsrm->done == 0) tsleep(&tdsrm->tsk, PVFS, "srvdsrm", timo); if (failpos == -1 && nfsds_failerr(tdsrm->err)) failpos = i; } /* * If failpos has been set, that mirror has failed, so it needs * to be disabled. */ if (failpos >= 0) { nmp = VFSTONFS(dvp[failpos]->v_mount); NFSLOCKMNT(nmp); if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM | NFSMNTP_CANCELRPCS)) == 0) { nmp->nm_privflag |= NFSMNTP_CANCELRPCS; NFSUNLOCKMNT(nmp); ds = nfsrv_deldsnmp(PNFSDOP_DELDSSERVER, nmp, p); NFSD_DEBUG(4, "dsremovefail fail=%d ds=%p\n", failpos, ds); if (ds != NULL) nfsrv_killrpcs(nmp); NFSLOCKMNT(nmp); nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS; wakeup(nmp); } NFSUNLOCKMNT(nmp); } /* Get rid all layouts for the file. */ nfsrv_freefilelayouts(fhp); NFSFREECRED(tcred); free(dsrm, M_TEMP); } /* * Generate a file name based on the file handle and put it in *bufp. * Return the number of bytes generated. */ static int nfsrv_putfhname(fhandle_t *fhp, char *bufp) { int i; uint8_t *cp; const uint8_t *hexdigits = "0123456789abcdef"; cp = (uint8_t *)fhp; for (i = 0; i < sizeof(*fhp); i++) { bufp[2 * i] = hexdigits[(*cp >> 4) & 0xf]; bufp[2 * i + 1] = hexdigits[*cp++ & 0xf]; } bufp[2 * i] = '\0'; return (2 * i); } /* * Update the Metadata file's attributes from the DS file when a Read/Write * layout is returned. * Basically just call nfsrv_proxyds() with procedure == NFSPROC_LAYOUTRETURN * so that it does a nfsrv_getattrdsrpc() and nfsrv_setextattr() on the DS file. */ int nfsrv_updatemdsattr(struct vnode *vp, struct nfsvattr *nap, NFSPROC_T *p) { struct ucred *tcred; int error; /* Do this as root so that it won't fail with EACCES. */ tcred = newnfs_getcred(); error = nfsrv_proxyds(vp, 0, 0, tcred, p, NFSPROC_LAYOUTRETURN, NULL, NULL, NULL, nap, NULL, NULL, 0, NULL); NFSFREECRED(tcred); return (error); } /* * Set the NFSv4 ACL on the DS file to the same ACL as the MDS file. */ static int nfsrv_dssetacl(struct vnode *vp, struct acl *aclp, struct ucred *cred, NFSPROC_T *p) { int error; error = nfsrv_proxyds(vp, 0, 0, cred, p, NFSPROC_SETACL, NULL, NULL, NULL, NULL, aclp, NULL, 0, NULL); return (error); } static int nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, struct thread *p, int ioproc, struct mbuf **mpp, char *cp, struct mbuf **mpp2, struct nfsvattr *nap, struct acl *aclp, off_t *offp, int content, bool *eofp) { struct nfsmount *nmp[NFSDEV_MAXMIRRORS], *failnmp; fhandle_t fh[NFSDEV_MAXMIRRORS]; struct vnode *dvp[NFSDEV_MAXMIRRORS]; struct nfsdevice *ds; struct pnfsdsattr dsattr; struct opnfsdsattr odsattr; char *buf; int buflen, error, failpos, i, mirrorcnt, origmircnt, trycnt; NFSD_DEBUG(4, "in nfsrv_proxyds\n"); /* * If not a regular file, not exported or not a pNFS server, * just return ENOENT. */ if (vp->v_type != VREG || (vp->v_mount->mnt_flag & MNT_EXPORTED) == 0 || nfsrv_devidcnt == 0) return (ENOENT); buflen = 1024; buf = malloc(buflen, M_TEMP, M_WAITOK); error = 0; /* * For Getattr, get the Change attribute (va_filerev) and size (va_size) * from the MetaData file's extended attribute. */ if (ioproc == NFSPROC_GETATTR) { error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsattr", &buflen, buf, p); if (error == 0) { if (buflen == sizeof(odsattr)) { NFSBCOPY(buf, &odsattr, buflen); nap->na_filerev = odsattr.dsa_filerev; nap->na_size = odsattr.dsa_size; nap->na_atime = odsattr.dsa_atime; nap->na_mtime = odsattr.dsa_mtime; /* * Fake na_bytes by rounding up na_size. * Since we don't know the block size, just * use BLKDEV_IOSIZE. */ nap->na_bytes = (odsattr.dsa_size + BLKDEV_IOSIZE - 1) & ~(BLKDEV_IOSIZE - 1); } else if (buflen == sizeof(dsattr)) { NFSBCOPY(buf, &dsattr, buflen); nap->na_filerev = dsattr.dsa_filerev; nap->na_size = dsattr.dsa_size; nap->na_atime = dsattr.dsa_atime; nap->na_mtime = dsattr.dsa_mtime; nap->na_bytes = dsattr.dsa_bytes; } else error = ENXIO; } if (error == 0) { /* * If nfsrv_pnfsgetdsattr is 0 or nfsrv_checkdsattr() * returns 0, just return now. nfsrv_checkdsattr() * returns 0 if there is no Read/Write layout * plus either an Open/Write_access or Write * delegation issued to a client for the file. */ if (nfsrv_pnfsgetdsattr == 0 || nfsrv_checkdsattr(vp, p) == 0) { free(buf, M_TEMP); return (error); } } /* * Clear ENOATTR so the code below will attempt to do a * nfsrv_getattrdsrpc() to get the attributes and (re)create * the extended attribute. */ if (error == ENOATTR) error = 0; } origmircnt = -1; trycnt = 0; tryagain: if (error == 0) { buflen = 1024; if (ioproc == NFSPROC_READDS && NFSVOPISLOCKED(vp) == LK_EXCLUSIVE) printf("nfsrv_proxyds: Readds vp exclusively locked\n"); error = nfsrv_dsgetsockmnt(vp, LK_SHARED, buf, &buflen, &mirrorcnt, p, dvp, fh, NULL, NULL, NULL, NULL, NULL, NULL, NULL); if (error == 0) { for (i = 0; i < mirrorcnt; i++) nmp[i] = VFSTONFS(dvp[i]->v_mount); } else printf("pNFS: proxy getextattr sockaddr=%d\n", error); } else printf("pNFS: nfsrv_dsgetsockmnt=%d\n", error); if (error == 0) { failpos = -1; if (origmircnt == -1) origmircnt = mirrorcnt; /* * If failpos is set to a mirror#, then that mirror has * failed and will be disabled. For Read, Getattr and Seek, the * function only tries one mirror, so if that mirror has * failed, it will need to be retried. As such, increment * tryitagain for these cases. * For Write, Setattr and Setacl, the function tries all * mirrors and will not return an error for the case where * one mirror has failed. For these cases, the functioning * mirror(s) will have been modified, so a retry isn't * necessary. These functions will set failpos for the * failed mirror#. */ if (ioproc == NFSPROC_READDS) { error = nfsrv_readdsrpc(fh, off, cnt, cred, p, nmp[0], mpp, mpp2); if (nfsds_failerr(error) && mirrorcnt > 1) { /* * Setting failpos will cause the mirror * to be disabled and then a retry of this * read is required. */ failpos = 0; error = 0; trycnt++; } } else if (ioproc == NFSPROC_WRITEDS) error = nfsrv_writedsrpc(fh, off, cnt, cred, p, vp, &nmp[0], mirrorcnt, mpp, cp, &failpos); else if (ioproc == NFSPROC_SETATTR) error = nfsrv_setattrdsrpc(fh, cred, p, vp, &nmp[0], mirrorcnt, nap, &failpos); else if (ioproc == NFSPROC_SETACL) error = nfsrv_setacldsrpc(fh, cred, p, vp, &nmp[0], mirrorcnt, aclp, &failpos); else if (ioproc == NFSPROC_SEEKDS) { error = nfsrv_seekdsrpc(fh, offp, content, eofp, cred, p, nmp[0]); if (nfsds_failerr(error) && mirrorcnt > 1) { /* * Setting failpos will cause the mirror * to be disabled and then a retry of this * read is required. */ failpos = 0; error = 0; trycnt++; } } else if (ioproc == NFSPROC_ALLOCATE) error = nfsrv_allocatedsrpc(fh, off, *offp, cred, p, vp, &nmp[0], mirrorcnt, &failpos); else if (ioproc == NFSPROC_DEALLOCATE) error = nfsrv_deallocatedsrpc(fh, off, *offp, cred, p, vp, &nmp[0], mirrorcnt, &failpos); else { error = nfsrv_getattrdsrpc(&fh[mirrorcnt - 1], cred, p, vp, nmp[mirrorcnt - 1], nap); if (nfsds_failerr(error) && mirrorcnt > 1) { /* * Setting failpos will cause the mirror * to be disabled and then a retry of this * getattr is required. */ failpos = mirrorcnt - 1; error = 0; trycnt++; } } ds = NULL; if (failpos >= 0) { failnmp = nmp[failpos]; NFSLOCKMNT(failnmp); if ((failnmp->nm_privflag & (NFSMNTP_FORCEDISM | NFSMNTP_CANCELRPCS)) == 0) { failnmp->nm_privflag |= NFSMNTP_CANCELRPCS; NFSUNLOCKMNT(failnmp); ds = nfsrv_deldsnmp(PNFSDOP_DELDSSERVER, failnmp, p); NFSD_DEBUG(4, "dsldsnmp fail=%d ds=%p\n", failpos, ds); if (ds != NULL) nfsrv_killrpcs(failnmp); NFSLOCKMNT(failnmp); failnmp->nm_privflag &= ~NFSMNTP_CANCELRPCS; wakeup(failnmp); } NFSUNLOCKMNT(failnmp); } for (i = 0; i < mirrorcnt; i++) NFSVOPUNLOCK(dvp[i]); NFSD_DEBUG(4, "nfsrv_proxyds: aft RPC=%d trya=%d\n", error, trycnt); /* Try the Read/Getattr again if a mirror was deleted. */ if (ds != NULL && trycnt > 0 && trycnt < origmircnt) goto tryagain; } else { /* Return ENOENT for any Extended Attribute error. */ error = ENOENT; } free(buf, M_TEMP); NFSD_DEBUG(4, "nfsrv_proxyds: error=%d\n", error); return (error); } /* * Get the DS mount point, fh and directory from the "pnfsd.dsfile" extended * attribute. * newnmpp - If it points to a non-NULL nmp, that is the destination and needs * to be checked. If it points to a NULL nmp, then it returns * a suitable destination. * curnmp - If non-NULL, it is the source mount for the copy. */ int nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, int *mirrorcntp, NFSPROC_T *p, struct vnode **dvpp, fhandle_t *fhp, char *devid, char *fnamep, struct vnode **nvpp, struct nfsmount **newnmpp, struct nfsmount *curnmp, int *ippos, int *dsdirp) { struct vnode *dvp, *nvp = NULL, **tdvpp; struct mount *mp; struct nfsmount *nmp, *newnmp; struct sockaddr *sad; struct sockaddr_in *sin; struct nfsdevice *ds, *tds, *fndds; struct pnfsdsfile *pf; uint32_t dsdir; int error, fhiszero, fnd, gotone, i, mirrorcnt; ASSERT_VOP_LOCKED(vp, "nfsrv_dsgetsockmnt vp"); *mirrorcntp = 1; tdvpp = dvpp; if (nvpp != NULL) *nvpp = NULL; if (dvpp != NULL) *dvpp = NULL; if (ippos != NULL) *ippos = -1; if (newnmpp != NULL) newnmp = *newnmpp; else newnmp = NULL; mp = vp->v_mount; error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile", buflenp, buf, p); mirrorcnt = *buflenp / sizeof(*pf); if (error == 0 && (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS || *buflenp != sizeof(*pf) * mirrorcnt)) error = ENOATTR; pf = (struct pnfsdsfile *)buf; /* If curnmp != NULL, check for a match in the mirror list. */ if (curnmp != NULL && error == 0) { fnd = 0; for (i = 0; i < mirrorcnt; i++, pf++) { sad = (struct sockaddr *)&pf->dsf_sin; if (nfsaddr2_match(sad, curnmp->nm_nam)) { if (ippos != NULL) *ippos = i; fnd = 1; break; } } if (fnd == 0) error = ENXIO; } gotone = 0; pf = (struct pnfsdsfile *)buf; NFSD_DEBUG(4, "nfsrv_dsgetsockmnt: mirrorcnt=%d err=%d\n", mirrorcnt, error); for (i = 0; i < mirrorcnt && error == 0; i++, pf++) { fhiszero = 0; sad = (struct sockaddr *)&pf->dsf_sin; sin = &pf->dsf_sin; dsdir = pf->dsf_dir; if (dsdir >= nfsrv_dsdirsize) { printf("nfsrv_dsgetsockmnt: dsdir=%d\n", dsdir); error = ENOATTR; } else if (nvpp != NULL && newnmp != NULL && nfsaddr2_match(sad, newnmp->nm_nam)) error = EEXIST; if (error == 0) { if (ippos != NULL && curnmp == NULL && sad->sa_family == AF_INET && sin->sin_addr.s_addr == 0) *ippos = i; if (NFSBCMP(&zerofh, &pf->dsf_fh, sizeof(zerofh)) == 0) fhiszero = 1; /* Use the socket address to find the mount point. */ fndds = NULL; NFSDDSLOCK(); /* Find a match for the IP address. */ TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) { if (ds->nfsdev_nmp != NULL) { dvp = ds->nfsdev_dvp; nmp = VFSTONFS(dvp->v_mount); if (nmp != ds->nfsdev_nmp) printf("different2 nmp %p %p\n", nmp, ds->nfsdev_nmp); if (nfsaddr2_match(sad, nmp->nm_nam)) { fndds = ds; break; } } } if (fndds != NULL && newnmpp != NULL && newnmp == NULL) { /* Search for a place to make a mirror copy. */ TAILQ_FOREACH(tds, &nfsrv_devidhead, nfsdev_list) { if (tds->nfsdev_nmp != NULL && fndds != tds && ((tds->nfsdev_mdsisset == 0 && fndds->nfsdev_mdsisset == 0) || (tds->nfsdev_mdsisset != 0 && fndds->nfsdev_mdsisset != 0 && fsidcmp(&tds->nfsdev_mdsfsid, &mp->mnt_stat.f_fsid) == 0))) { *newnmpp = tds->nfsdev_nmp; break; } } if (tds != NULL) { /* * Move this entry to the end of the * list, so it won't be selected as * easily the next time. */ TAILQ_REMOVE(&nfsrv_devidhead, tds, nfsdev_list); TAILQ_INSERT_TAIL(&nfsrv_devidhead, tds, nfsdev_list); } } NFSDDSUNLOCK(); if (fndds != NULL) { dvp = fndds->nfsdev_dsdir[dsdir]; if (lktype != 0 || fhiszero != 0 || (nvpp != NULL && *nvpp == NULL)) { if (fhiszero != 0) error = vn_lock(dvp, LK_EXCLUSIVE); else if (lktype != 0) error = vn_lock(dvp, lktype); else error = vn_lock(dvp, LK_SHARED); /* * If the file handle is all 0's, try to * do a Lookup against the DS to acquire * it. * If dvpp == NULL or the Lookup fails, * unlock dvp after the call. */ if (error == 0 && (fhiszero != 0 || (nvpp != NULL && *nvpp == NULL))) { error = nfsrv_pnfslookupds(vp, dvp, pf, &nvp, p); if (error == 0) { if (fhiszero != 0) nfsrv_pnfssetfh( vp, pf, devid, fnamep, nvp, p); if (nvpp != NULL && *nvpp == NULL) { *nvpp = nvp; *dsdirp = dsdir; } else vput(nvp); } if (error != 0 || lktype == 0) NFSVOPUNLOCK(dvp); } } if (error == 0) { gotone++; NFSD_DEBUG(4, "gotone=%d\n", gotone); if (devid != NULL) { NFSBCOPY(fndds->nfsdev_deviceid, devid, NFSX_V4DEVICEID); devid += NFSX_V4DEVICEID; } if (dvpp != NULL) *tdvpp++ = dvp; if (fhp != NULL) NFSBCOPY(&pf->dsf_fh, fhp++, NFSX_MYFH); if (fnamep != NULL && gotone == 1) strlcpy(fnamep, pf->dsf_filename, sizeof(pf->dsf_filename)); } else NFSD_DEBUG(4, "nfsrv_dsgetsockmnt " "err=%d\n", error); } } } if (error == 0 && gotone == 0) error = ENOENT; NFSD_DEBUG(4, "eo nfsrv_dsgetsockmnt: gotone=%d err=%d\n", gotone, error); if (error == 0) *mirrorcntp = gotone; else { if (gotone > 0 && dvpp != NULL) { /* * If the error didn't occur on the first one and * dvpp != NULL, the one(s) prior to the failure will * have locked dvp's that need to be unlocked. */ for (i = 0; i < gotone; i++) { NFSVOPUNLOCK(*dvpp); *dvpp++ = NULL; } } /* * If it found the vnode to be copied from before a failure, * it needs to be vput()'d. */ if (nvpp != NULL && *nvpp != NULL) { vput(*nvpp); *nvpp = NULL; } } return (error); } /* * Set the extended attribute for the Change attribute. */ static int nfsrv_setextattr(struct vnode *vp, struct nfsvattr *nap, NFSPROC_T *p) { struct pnfsdsattr dsattr; int error; ASSERT_VOP_ELOCKED(vp, "nfsrv_setextattr vp"); dsattr.dsa_filerev = nap->na_filerev; dsattr.dsa_size = nap->na_size; dsattr.dsa_atime = nap->na_atime; dsattr.dsa_mtime = nap->na_mtime; dsattr.dsa_bytes = nap->na_bytes; error = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsattr", sizeof(dsattr), (char *)&dsattr, p); if (error != 0) printf("pNFS: setextattr=%d\n", error); return (error); } static int nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, NFSPROC_T *p, struct nfsmount *nmp, struct mbuf **mpp, struct mbuf **mpendp) { uint32_t *tl; struct nfsrv_descript *nd; nfsv4stateid_t st; struct mbuf *m, *m2; int error = 0, retlen, tlen, trimlen; NFSD_DEBUG(4, "in nfsrv_readdsrpc\n"); nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); *mpp = NULL; /* * Use a stateid where other is an alternating 01010 pattern and * seqid is 0xffffffff. This value is not defined as special by * the RFC and is used by the FreeBSD NFS server to indicate an * MDS->DS proxy operation. */ st.other[0] = 0x55555555; st.other[1] = 0x55555555; st.other[2] = 0x55555555; st.seqid = 0xffffffff; nfscl_reqstart(nd, NFSPROC_READDS, nmp, (u_int8_t *)fhp, sizeof(*fhp), NULL, NULL, 0, 0, cred); nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED * 3); txdr_hyper(off, tl); *(tl + 2) = txdr_unsigned(len); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { free(nd, M_TEMP); return (error); } if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); NFSM_STRSIZ(retlen, len); if (retlen > 0) { /* Trim off the pre-data XDR from the mbuf chain. */ m = nd->nd_mrep; while (m != NULL && m != nd->nd_md) { if (m->m_next == nd->nd_md) { m->m_next = NULL; m_freem(nd->nd_mrep); nd->nd_mrep = m = nd->nd_md; } else m = m->m_next; } if (m == NULL) { printf("nfsrv_readdsrpc: busted mbuf list\n"); error = ENOENT; goto nfsmout; } /* * Now, adjust first mbuf so that any XDR before the * read data is skipped over. */ trimlen = nd->nd_dpos - mtod(m, char *); if (trimlen > 0) { m->m_len -= trimlen; NFSM_DATAP(m, trimlen); } /* * Truncate the mbuf chain at retlen bytes of data, * plus XDR padding that brings the length up to a * multiple of 4. */ tlen = NFSM_RNDUP(retlen); do { if (m->m_len >= tlen) { m->m_len = tlen; tlen = 0; m2 = m->m_next; m->m_next = NULL; m_freem(m2); break; } tlen -= m->m_len; m = m->m_next; } while (m != NULL); if (tlen > 0) { printf("nfsrv_readdsrpc: busted mbuf list\n"); error = ENOENT; goto nfsmout; } *mpp = nd->nd_mrep; *mpendp = m; nd->nd_mrep = NULL; } } else error = nd->nd_repstat; nfsmout: /* If nd->nd_mrep is already NULL, this is a no-op. */ m_freem(nd->nd_mrep); free(nd, M_TEMP); NFSD_DEBUG(4, "nfsrv_readdsrpc error=%d\n", error); return (error); } /* * Do a write RPC on a DS data file, using this structure for the arguments, * so that this function can be executed by a separate kernel process. */ struct nfsrvwritedsdorpc { int done; int inprog; struct task tsk; fhandle_t fh; off_t off; int len; struct nfsmount *nmp; struct ucred *cred; NFSPROC_T *p; struct mbuf *m; int err; }; static int nfsrv_writedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, int len, struct nfsvattr *nap, struct mbuf *m, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript *nd; nfsattrbit_t attrbits; nfsv4stateid_t st; int commit, error, retlen; nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); nfscl_reqstart(nd, NFSPROC_WRITE, nmp, (u_int8_t *)fhp, sizeof(fhandle_t), NULL, NULL, 0, 0, cred); /* * Use a stateid where other is an alternating 01010 pattern and * seqid is 0xffffffff. This value is not defined as special by * the RFC and is used by the FreeBSD NFS server to indicate an * MDS->DS proxy operation. */ st.other[0] = 0x55555555; st.other[1] = 0x55555555; st.other[2] = 0x55555555; st.seqid = 0xffffffff; nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); txdr_hyper(off, tl); tl += 2; /* * Do all writes FileSync, since the server doesn't hold onto dirty * buffers. Since clients should be accessing the DS servers directly * using the pNFS layouts, this just needs to work correctly as a * fallback. */ *tl++ = txdr_unsigned(NFSWRITE_FILESYNC); *tl = txdr_unsigned(len); NFSD_DEBUG(4, "nfsrv_writedsdorpc: len=%d\n", len); /* Put data in mbuf chain. */ nd->nd_mb->m_next = m; /* Set nd_mb and nd_bpos to end of data. */ while (m->m_next != NULL) m = m->m_next; nd->nd_mb = m; nfsm_set(nd, m->m_len); NFSD_DEBUG(4, "nfsrv_writedsdorpc: lastmb len=%d\n", m->m_len); /* Do a Getattr for the attributes that change upon writing. */ NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESS); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SPACEUSED); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { free(nd, M_TEMP); return (error); } NFSD_DEBUG(4, "nfsrv_writedsdorpc: aft writerpc=%d\n", nd->nd_repstat); /* Get rid of weak cache consistency data for now. */ if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) == (ND_NFSV4 | ND_V4WCCATTR)) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); NFSD_DEBUG(4, "nfsrv_writedsdorpc: wcc attr=%d\n", error); if (error != 0) goto nfsmout; /* * Get rid of Op# and status for next op. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) nd->nd_flag |= ND_NOMOREDATA; } if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); retlen = fxdr_unsigned(int, *tl++); commit = fxdr_unsigned(int, *tl); if (commit != NFSWRITE_FILESYNC) error = NFSERR_IO; NFSD_DEBUG(4, "nfsrv_writedsdorpc:retlen=%d commit=%d err=%d\n", retlen, commit, error); } else error = nd->nd_repstat; /* We have no use for the Write Verifier since we use FileSync. */ /* * Get the Change, Size, Access Time and Modify Time attributes and set * on the Metadata file, so its attributes will be what the file's * would be if it had been written. */ if (error == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); } NFSD_DEBUG(4, "nfsrv_writedsdorpc: aft loadattr=%d\n", error); nfsmout: m_freem(nd->nd_mrep); free(nd, M_TEMP); NFSD_DEBUG(4, "nfsrv_writedsdorpc error=%d\n", error); return (error); } /* * Start up the thread that will execute nfsrv_writedsdorpc(). */ static void start_writedsdorpc(void *arg, int pending) { struct nfsrvwritedsdorpc *drpc; drpc = (struct nfsrvwritedsdorpc *)arg; drpc->err = nfsrv_writedsdorpc(drpc->nmp, &drpc->fh, drpc->off, drpc->len, NULL, drpc->m, drpc->cred, drpc->p); drpc->done = 1; NFSD_DEBUG(4, "start_writedsdorpc: err=%d\n", drpc->err); } static int nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt, struct mbuf **mpp, char *cp, int *failposp) { struct nfsrvwritedsdorpc *drpc, *tdrpc = NULL; struct nfsvattr na; struct mbuf *m; int error, i, offs, ret, timo; NFSD_DEBUG(4, "in nfsrv_writedsrpc\n"); KASSERT(*mpp != NULL, ("nfsrv_writedsrpc: NULL mbuf chain")); drpc = NULL; if (mirrorcnt > 1) tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP, M_WAITOK); /* Calculate offset in mbuf chain that data starts. */ offs = cp - mtod(*mpp, char *); NFSD_DEBUG(4, "nfsrv_writedsrpc: mcopy offs=%d len=%d\n", offs, len); /* * Do the write RPC for every DS, using a separate kernel process * for every DS except the last one. */ error = 0; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { tdrpc->done = 0; NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp)); tdrpc->off = off; tdrpc->len = len; tdrpc->nmp = *nmpp; tdrpc->cred = cred; tdrpc->p = p; tdrpc->inprog = 0; tdrpc->err = 0; tdrpc->m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK); ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_writedsdorpc, tdrpc); NFSD_DEBUG(4, "nfsrv_writedsrpc: nfs_pnfsio=%d\n", ret); } if (ret != 0) { ret = nfsrv_writedsdorpc(*nmpp, fhp, off, len, NULL, tdrpc->m, cred, p); if (nfsds_failerr(ret) && *failposp == -1) *failposp = i; else if (error == 0 && ret != 0) error = ret; } nmpp++; fhp++; } m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK); ret = nfsrv_writedsdorpc(*nmpp, fhp, off, len, &na, m, cred, p); if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1) *failposp = mirrorcnt - 1; else if (error == 0 && ret != 0) error = ret; if (error == 0) error = nfsrv_setextattr(vp, &na, p); NFSD_DEBUG(4, "nfsrv_writedsrpc: aft setextat=%d\n", error); tdrpc = drpc; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { /* Wait for RPCs on separate threads to complete. */ while (tdrpc->inprog != 0 && tdrpc->done == 0) tsleep(&tdrpc->tsk, PVFS, "srvwrds", timo); if (nfsds_failerr(tdrpc->err) && *failposp == -1) *failposp = i; else if (error == 0 && tdrpc->err != 0) error = tdrpc->err; } free(drpc, M_TEMP); return (error); } /* * Do a allocate RPC on a DS data file, using this structure for the arguments, * so that this function can be executed by a separate kernel process. */ struct nfsrvallocatedsdorpc { int done; int inprog; struct task tsk; fhandle_t fh; off_t off; off_t len; struct nfsmount *nmp; struct ucred *cred; NFSPROC_T *p; int err; }; static int nfsrv_allocatedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, off_t len, struct nfsvattr *nap, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript *nd; nfsattrbit_t attrbits; nfsv4stateid_t st; int error; nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); nfscl_reqstart(nd, NFSPROC_ALLOCATE, nmp, (u_int8_t *)fhp, sizeof(fhandle_t), NULL, NULL, 0, 0, cred); /* * Use a stateid where other is an alternating 01010 pattern and * seqid is 0xffffffff. This value is not defined as special by * the RFC and is used by the FreeBSD NFS server to indicate an * MDS->DS proxy operation. */ st.other[0] = 0x55555555; st.other[1] = 0x55555555; st.other[2] = 0x55555555; st.seqid = 0xffffffff; nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: len=%jd\n", (intmax_t)len); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { free(nd, M_TEMP); return (error); } NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: aft allocaterpc=%d\n", nd->nd_repstat); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); } else error = nd->nd_repstat; NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: aft loadattr=%d\n", error); nfsmout: m_freem(nd->nd_mrep); free(nd, M_TEMP); NFSD_DEBUG(4, "nfsrv_allocatedsdorpc error=%d\n", error); return (error); } /* * Start up the thread that will execute nfsrv_allocatedsdorpc(). */ static void start_allocatedsdorpc(void *arg, int pending) { struct nfsrvallocatedsdorpc *drpc; drpc = (struct nfsrvallocatedsdorpc *)arg; drpc->err = nfsrv_allocatedsdorpc(drpc->nmp, &drpc->fh, drpc->off, drpc->len, NULL, drpc->cred, drpc->p); drpc->done = 1; NFSD_DEBUG(4, "start_allocatedsdorpc: err=%d\n", drpc->err); } static int nfsrv_allocatedsrpc(fhandle_t *fhp, off_t off, off_t len, struct ucred *cred, NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt, int *failposp) { struct nfsrvallocatedsdorpc *drpc, *tdrpc = NULL; struct nfsvattr na; int error, i, ret, timo; NFSD_DEBUG(4, "in nfsrv_allocatedsrpc\n"); drpc = NULL; if (mirrorcnt > 1) tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP, M_WAITOK); /* * Do the allocate RPC for every DS, using a separate kernel process * for every DS except the last one. */ error = 0; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { tdrpc->done = 0; NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp)); tdrpc->off = off; tdrpc->len = len; tdrpc->nmp = *nmpp; tdrpc->cred = cred; tdrpc->p = p; tdrpc->inprog = 0; tdrpc->err = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_allocatedsdorpc, tdrpc); NFSD_DEBUG(4, "nfsrv_allocatedsrpc: nfs_pnfsio=%d\n", ret); } if (ret != 0) { ret = nfsrv_allocatedsdorpc(*nmpp, fhp, off, len, NULL, cred, p); if (nfsds_failerr(ret) && *failposp == -1) *failposp = i; else if (error == 0 && ret != 0) error = ret; } nmpp++; fhp++; } ret = nfsrv_allocatedsdorpc(*nmpp, fhp, off, len, &na, cred, p); if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1) *failposp = mirrorcnt - 1; else if (error == 0 && ret != 0) error = ret; if (error == 0) error = nfsrv_setextattr(vp, &na, p); NFSD_DEBUG(4, "nfsrv_allocatedsrpc: aft setextat=%d\n", error); tdrpc = drpc; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { /* Wait for RPCs on separate threads to complete. */ while (tdrpc->inprog != 0 && tdrpc->done == 0) tsleep(&tdrpc->tsk, PVFS, "srvalds", timo); if (nfsds_failerr(tdrpc->err) && *failposp == -1) *failposp = i; else if (error == 0 && tdrpc->err != 0) error = tdrpc->err; } free(drpc, M_TEMP); return (error); } /* * Do a deallocate RPC on a DS data file, using this structure for the * arguments, so that this function can be executed by a separate kernel * process. */ struct nfsrvdeallocatedsdorpc { int done; int inprog; struct task tsk; fhandle_t fh; off_t off; off_t len; struct nfsmount *nmp; struct ucred *cred; NFSPROC_T *p; int err; }; static int nfsrv_deallocatedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, off_t len, struct nfsvattr *nap, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript *nd; nfsattrbit_t attrbits; nfsv4stateid_t st; int error; nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); nfscl_reqstart(nd, NFSPROC_DEALLOCATE, nmp, (u_int8_t *)fhp, sizeof(fhandle_t), NULL, NULL, 0, 0, cred); /* * Use a stateid where other is an alternating 01010 pattern and * seqid is 0xffffffff. This value is not defined as special by * the RFC and is used by the FreeBSD NFS server to indicate an * MDS->DS proxy operation. */ st.other[0] = 0x55555555; st.other[1] = 0x55555555; st.other[2] = 0x55555555; st.seqid = 0xffffffff; nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; NFSD_DEBUG(4, "nfsrv_deallocatedsdorpc: len=%jd\n", (intmax_t)len); /* Do a Getattr for the attributes that change upon writing. */ NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESS); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SPACEUSED); *tl = txdr_unsigned(NFSV4OP_GETATTR); nfsrv_putattrbit(nd, &attrbits); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { free(nd, M_TEMP); return (error); } NFSD_DEBUG(4, "nfsrv_deallocatedsdorpc: aft deallocaterpc=%d\n", nd->nd_repstat); /* Get rid of weak cache consistency data for now. */ if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) == (ND_NFSV4 | ND_V4WCCATTR)) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); NFSD_DEBUG(4, "nfsrv_deallocatedsdorpc: wcc attr=%d\n", error); if (error != 0) goto nfsmout; /* * Get rid of Op# and status for next op. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) nd->nd_flag |= ND_NOMOREDATA; } if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); } else error = nd->nd_repstat; NFSD_DEBUG(4, "nfsrv_deallocatedsdorpc: aft loadattr=%d\n", error); nfsmout: m_freem(nd->nd_mrep); free(nd, M_TEMP); NFSD_DEBUG(4, "nfsrv_deallocatedsdorpc error=%d\n", error); return (error); } /* * Start up the thread that will execute nfsrv_deallocatedsdorpc(). */ static void start_deallocatedsdorpc(void *arg, int pending) { struct nfsrvdeallocatedsdorpc *drpc; drpc = (struct nfsrvdeallocatedsdorpc *)arg; drpc->err = nfsrv_deallocatedsdorpc(drpc->nmp, &drpc->fh, drpc->off, drpc->len, NULL, drpc->cred, drpc->p); drpc->done = 1; NFSD_DEBUG(4, "start_deallocatedsdorpc: err=%d\n", drpc->err); } static int nfsrv_deallocatedsrpc(fhandle_t *fhp, off_t off, off_t len, struct ucred *cred, NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt, int *failposp) { struct nfsrvdeallocatedsdorpc *drpc, *tdrpc = NULL; struct nfsvattr na; int error, i, ret, timo; NFSD_DEBUG(4, "in nfsrv_deallocatedsrpc\n"); drpc = NULL; if (mirrorcnt > 1) tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP, M_WAITOK); /* * Do the deallocate RPC for every DS, using a separate kernel process * for every DS except the last one. */ error = 0; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { tdrpc->done = 0; NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp)); tdrpc->off = off; tdrpc->len = len; tdrpc->nmp = *nmpp; tdrpc->cred = cred; tdrpc->p = p; tdrpc->inprog = 0; tdrpc->err = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_deallocatedsdorpc, tdrpc); NFSD_DEBUG(4, "nfsrv_deallocatedsrpc: nfs_pnfsio=%d\n", ret); } if (ret != 0) { ret = nfsrv_deallocatedsdorpc(*nmpp, fhp, off, len, NULL, cred, p); if (nfsds_failerr(ret) && *failposp == -1) *failposp = i; else if (error == 0 && ret != 0) error = ret; } nmpp++; fhp++; } ret = nfsrv_deallocatedsdorpc(*nmpp, fhp, off, len, &na, cred, p); if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1) *failposp = mirrorcnt - 1; else if (error == 0 && ret != 0) error = ret; if (error == 0) error = nfsrv_setextattr(vp, &na, p); NFSD_DEBUG(4, "nfsrv_deallocatedsrpc: aft setextat=%d\n", error); tdrpc = drpc; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { /* Wait for RPCs on separate threads to complete. */ while (tdrpc->inprog != 0 && tdrpc->done == 0) tsleep(&tdrpc->tsk, PVFS, "srvalds", timo); if (nfsds_failerr(tdrpc->err) && *failposp == -1) *failposp = i; else if (error == 0 && tdrpc->err != 0) error = tdrpc->err; } free(drpc, M_TEMP); return (error); } static int nfsrv_setattrdsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, struct vnode *vp, struct nfsmount *nmp, struct nfsvattr *nap, struct nfsvattr *dsnap) { uint32_t *tl; struct nfsrv_descript *nd; nfsv4stateid_t st; nfsattrbit_t attrbits; int error; NFSD_DEBUG(4, "in nfsrv_setattrdsdorpc\n"); nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); /* * Use a stateid where other is an alternating 01010 pattern and * seqid is 0xffffffff. This value is not defined as special by * the RFC and is used by the FreeBSD NFS server to indicate an * MDS->DS proxy operation. */ st.other[0] = 0x55555555; st.other[1] = 0x55555555; st.other[2] = 0x55555555; st.seqid = 0xffffffff; nfscl_reqstart(nd, NFSPROC_SETATTR, nmp, (u_int8_t *)fhp, sizeof(*fhp), NULL, NULL, 0, 0, cred); nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID); nfscl_fillsattr(nd, &nap->na_vattr, vp, NFSSATTR_FULL, 0); /* Do a Getattr for the attributes that change due to writing. */ NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESS); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SPACEUSED); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { free(nd, M_TEMP); return (error); } NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: aft setattrrpc=%d\n", nd->nd_repstat); /* Get rid of weak cache consistency data for now. */ if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) == (ND_NFSV4 | ND_V4WCCATTR)) { error = nfsv4_loadattr(nd, NULL, dsnap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: wcc attr=%d\n", error); if (error != 0) goto nfsmout; /* * Get rid of Op# and status for next op. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) nd->nd_flag |= ND_NOMOREDATA; } error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error != 0) goto nfsmout; if (nd->nd_repstat != 0) error = nd->nd_repstat; /* * Get the Change, Size, Access Time and Modify Time attributes and set * on the Metadata file, so its attributes will be what the file's * would be if it had been written. */ if (error == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, dsnap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); } NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: aft setattr loadattr=%d\n", error); nfsmout: m_freem(nd->nd_mrep); free(nd, M_TEMP); NFSD_DEBUG(4, "nfsrv_setattrdsdorpc error=%d\n", error); return (error); } struct nfsrvsetattrdsdorpc { int done; int inprog; struct task tsk; fhandle_t fh; struct nfsmount *nmp; struct vnode *vp; struct ucred *cred; NFSPROC_T *p; struct nfsvattr na; struct nfsvattr dsna; int err; }; /* * Start up the thread that will execute nfsrv_setattrdsdorpc(). */ static void start_setattrdsdorpc(void *arg, int pending) { struct nfsrvsetattrdsdorpc *drpc; drpc = (struct nfsrvsetattrdsdorpc *)arg; drpc->err = nfsrv_setattrdsdorpc(&drpc->fh, drpc->cred, drpc->p, drpc->vp, drpc->nmp, &drpc->na, &drpc->dsna); drpc->done = 1; } static int nfsrv_setattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt, struct nfsvattr *nap, int *failposp) { struct nfsrvsetattrdsdorpc *drpc, *tdrpc = NULL; struct nfsvattr na; int error, i, ret, timo; NFSD_DEBUG(4, "in nfsrv_setattrdsrpc\n"); drpc = NULL; if (mirrorcnt > 1) tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP, M_WAITOK); /* * Do the setattr RPC for every DS, using a separate kernel process * for every DS except the last one. */ error = 0; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { tdrpc->done = 0; tdrpc->inprog = 0; NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp)); tdrpc->nmp = *nmpp; tdrpc->vp = vp; tdrpc->cred = cred; tdrpc->p = p; tdrpc->na = *nap; tdrpc->err = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_setattrdsdorpc, tdrpc); NFSD_DEBUG(4, "nfsrv_setattrdsrpc: nfs_pnfsio=%d\n", ret); } if (ret != 0) { ret = nfsrv_setattrdsdorpc(fhp, cred, p, vp, *nmpp, nap, &na); if (nfsds_failerr(ret) && *failposp == -1) *failposp = i; else if (error == 0 && ret != 0) error = ret; } nmpp++; fhp++; } ret = nfsrv_setattrdsdorpc(fhp, cred, p, vp, *nmpp, nap, &na); if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1) *failposp = mirrorcnt - 1; else if (error == 0 && ret != 0) error = ret; if (error == 0) error = nfsrv_setextattr(vp, &na, p); NFSD_DEBUG(4, "nfsrv_setattrdsrpc: aft setextat=%d\n", error); tdrpc = drpc; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { /* Wait for RPCs on separate threads to complete. */ while (tdrpc->inprog != 0 && tdrpc->done == 0) tsleep(&tdrpc->tsk, PVFS, "srvsads", timo); if (nfsds_failerr(tdrpc->err) && *failposp == -1) *failposp = i; else if (error == 0 && tdrpc->err != 0) error = tdrpc->err; } free(drpc, M_TEMP); return (error); } /* * Do a Setattr of an NFSv4 ACL on the DS file. */ static int nfsrv_setacldsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, struct vnode *vp, struct nfsmount *nmp, struct acl *aclp) { struct nfsrv_descript *nd; nfsv4stateid_t st; nfsattrbit_t attrbits; int error; NFSD_DEBUG(4, "in nfsrv_setacldsdorpc\n"); nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); /* * Use a stateid where other is an alternating 01010 pattern and * seqid is 0xffffffff. This value is not defined as special by * the RFC and is used by the FreeBSD NFS server to indicate an * MDS->DS proxy operation. */ st.other[0] = 0x55555555; st.other[1] = 0x55555555; st.other[2] = 0x55555555; st.seqid = 0xffffffff; nfscl_reqstart(nd, NFSPROC_SETACL, nmp, (u_int8_t *)fhp, sizeof(*fhp), NULL, NULL, 0, 0, cred); nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL); /* * The "vp" argument to nfsv4_fillattr() is only used for vnode_type(), * so passing in the metadata "vp" will be ok, since it is of * the same type (VREG). */ nfsv4_fillattr(nd, NULL, vp, aclp, NULL, NULL, 0, &attrbits, NULL, NULL, 0, 0, 0, 0, 0, NULL); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { free(nd, M_TEMP); return (error); } NFSD_DEBUG(4, "nfsrv_setacldsdorpc: aft setaclrpc=%d\n", nd->nd_repstat); error = nd->nd_repstat; m_freem(nd->nd_mrep); free(nd, M_TEMP); return (error); } struct nfsrvsetacldsdorpc { int done; int inprog; struct task tsk; fhandle_t fh; struct nfsmount *nmp; struct vnode *vp; struct ucred *cred; NFSPROC_T *p; struct acl *aclp; int err; }; /* * Start up the thread that will execute nfsrv_setacldsdorpc(). */ static void start_setacldsdorpc(void *arg, int pending) { struct nfsrvsetacldsdorpc *drpc; drpc = (struct nfsrvsetacldsdorpc *)arg; drpc->err = nfsrv_setacldsdorpc(&drpc->fh, drpc->cred, drpc->p, drpc->vp, drpc->nmp, drpc->aclp); drpc->done = 1; } static int nfsrv_setacldsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt, struct acl *aclp, int *failposp) { struct nfsrvsetacldsdorpc *drpc, *tdrpc = NULL; int error, i, ret, timo; NFSD_DEBUG(4, "in nfsrv_setacldsrpc\n"); drpc = NULL; if (mirrorcnt > 1) tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP, M_WAITOK); /* * Do the setattr RPC for every DS, using a separate kernel process * for every DS except the last one. */ error = 0; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { tdrpc->done = 0; tdrpc->inprog = 0; NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp)); tdrpc->nmp = *nmpp; tdrpc->vp = vp; tdrpc->cred = cred; tdrpc->p = p; tdrpc->aclp = aclp; tdrpc->err = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_setacldsdorpc, tdrpc); NFSD_DEBUG(4, "nfsrv_setacldsrpc: nfs_pnfsio=%d\n", ret); } if (ret != 0) { ret = nfsrv_setacldsdorpc(fhp, cred, p, vp, *nmpp, aclp); if (nfsds_failerr(ret) && *failposp == -1) *failposp = i; else if (error == 0 && ret != 0) error = ret; } nmpp++; fhp++; } ret = nfsrv_setacldsdorpc(fhp, cred, p, vp, *nmpp, aclp); if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1) *failposp = mirrorcnt - 1; else if (error == 0 && ret != 0) error = ret; NFSD_DEBUG(4, "nfsrv_setacldsrpc: aft setextat=%d\n", error); tdrpc = drpc; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { /* Wait for RPCs on separate threads to complete. */ while (tdrpc->inprog != 0 && tdrpc->done == 0) tsleep(&tdrpc->tsk, PVFS, "srvacds", timo); if (nfsds_failerr(tdrpc->err) && *failposp == -1) *failposp = i; else if (error == 0 && tdrpc->err != 0) error = tdrpc->err; } free(drpc, M_TEMP); return (error); } /* * Getattr call to the DS for the attributes that change due to writing. */ static int nfsrv_getattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, struct vnode *vp, struct nfsmount *nmp, struct nfsvattr *nap) { struct nfsrv_descript *nd; int error; nfsattrbit_t attrbits; NFSD_DEBUG(4, "in nfsrv_getattrdsrpc\n"); nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, (u_int8_t *)fhp, sizeof(fhandle_t), NULL, NULL, 0, 0, cred); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESS); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SPACEUSED); (void) nfsrv_putattrbit(nd, &attrbits); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { free(nd, M_TEMP); return (error); } NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft getattrrpc=%d\n", nd->nd_repstat); if (nd->nd_repstat == 0) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); /* * We can only save the updated values in the extended * attribute if the vp is exclusively locked. * This should happen when any of the following operations * occur on the vnode: * Close, Delegreturn, LayoutCommit, LayoutReturn * As such, the updated extended attribute should get saved * before nfsrv_checkdsattr() returns 0 and allows the cached * attributes to be returned without calling this function. */ if (error == 0 && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { error = nfsrv_setextattr(vp, nap, p); NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft setextat=%d\n", error); } } else error = nd->nd_repstat; m_freem(nd->nd_mrep); free(nd, M_TEMP); NFSD_DEBUG(4, "nfsrv_getattrdsrpc error=%d\n", error); return (error); } /* * Seek call to a DS. */ static int nfsrv_seekdsrpc(fhandle_t *fhp, off_t *offp, int content, bool *eofp, struct ucred *cred, NFSPROC_T *p, struct nfsmount *nmp) { uint32_t *tl; struct nfsrv_descript *nd; nfsv4stateid_t st; int error; NFSD_DEBUG(4, "in nfsrv_seekdsrpc\n"); /* * Use a stateid where other is an alternating 01010 pattern and * seqid is 0xffffffff. This value is not defined as special by * the RFC and is used by the FreeBSD NFS server to indicate an * MDS->DS proxy operation. */ st.other[0] = 0x55555555; st.other[1] = 0x55555555; st.other[2] = 0x55555555; st.seqid = 0xffffffff; nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); nfscl_reqstart(nd, NFSPROC_SEEKDS, nmp, (u_int8_t *)fhp, sizeof(fhandle_t), NULL, NULL, 0, 0, cred); nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(*offp, tl); tl += 2; *tl = txdr_unsigned(content); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) { free(nd, M_TEMP); return (error); } NFSD_DEBUG(4, "nfsrv_seekdsrpc: aft seekrpc=%d\n", nd->nd_repstat); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED + NFSX_HYPER); if (*tl++ == newnfs_true) *eofp = true; else *eofp = false; *offp = fxdr_hyper(tl); } else error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); free(nd, M_TEMP); NFSD_DEBUG(4, "nfsrv_seekdsrpc error=%d\n", error); return (error); } /* * Get the device id and file handle for a DS file. */ int nfsrv_dsgetdevandfh(struct vnode *vp, NFSPROC_T *p, int *mirrorcntp, fhandle_t *fhp, char *devid) { int buflen, error; char *buf; buflen = 1024; buf = malloc(buflen, M_TEMP, M_WAITOK); error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, mirrorcntp, p, NULL, fhp, devid, NULL, NULL, NULL, NULL, NULL, NULL); free(buf, M_TEMP); return (error); } /* * Do a Lookup against the DS for the filename. */ static int nfsrv_pnfslookupds(struct vnode *vp, struct vnode *dvp, struct pnfsdsfile *pf, struct vnode **nvpp, NFSPROC_T *p) { struct nameidata named; struct ucred *tcred; char *bufp; u_long *hashp; struct vnode *nvp; int error; tcred = newnfs_getcred(); named.ni_cnd.cn_nameiop = LOOKUP; named.ni_cnd.cn_lkflags = LK_SHARED | LK_RETRY; named.ni_cnd.cn_cred = tcred; - named.ni_cnd.cn_flags = ISLASTCN | LOCKPARENT | LOCKLEAF | SAVENAME; + named.ni_cnd.cn_flags = ISLASTCN | LOCKPARENT | LOCKLEAF; nfsvno_setpathbuf(&named, &bufp, &hashp); named.ni_cnd.cn_nameptr = bufp; named.ni_cnd.cn_namelen = strlen(pf->dsf_filename); strlcpy(bufp, pf->dsf_filename, NAME_MAX); NFSD_DEBUG(4, "nfsrv_pnfslookupds: filename=%s\n", bufp); error = VOP_LOOKUP(dvp, &nvp, &named.ni_cnd); NFSD_DEBUG(4, "nfsrv_pnfslookupds: aft LOOKUP=%d\n", error); NFSFREECRED(tcred); nfsvno_relpathbuf(&named); if (error == 0) *nvpp = nvp; NFSD_DEBUG(4, "eo nfsrv_pnfslookupds=%d\n", error); return (error); } /* * Set the file handle to the correct one. */ static void nfsrv_pnfssetfh(struct vnode *vp, struct pnfsdsfile *pf, char *devid, char *fnamep, struct vnode *nvp, NFSPROC_T *p) { struct nfsnode *np; int ret = 0; np = VTONFS(nvp); NFSBCOPY(np->n_fhp->nfh_fh, &pf->dsf_fh, NFSX_MYFH); /* * We can only do a vn_set_extattr() if the vnode is exclusively * locked and vn_start_write() has been done. If devid != NULL or * fnamep != NULL or the vnode is shared locked, vn_start_write() * may not have been done. * If not done now, it will be done on a future call. */ if (devid == NULL && fnamep == NULL && NFSVOPISLOCKED(vp) == LK_EXCLUSIVE) ret = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile", sizeof(*pf), (char *)pf, p); NFSD_DEBUG(4, "eo nfsrv_pnfssetfh=%d\n", ret); } /* * Cause RPCs waiting on "nmp" to fail. This is called for a DS mount point * when the DS has failed. */ void nfsrv_killrpcs(struct nfsmount *nmp) { /* * Call newnfs_nmcancelreqs() to cause * any RPCs in progress on the mount point to * fail. * This will cause any process waiting for an * RPC to complete while holding a vnode lock * on the mounted-on vnode (such as "df" or * a non-forced "umount") to fail. * This will unlock the mounted-on vnode so * a forced dismount can succeed. * The NFSMNTP_CANCELRPCS flag should be set when this function is * called. */ newnfs_nmcancelreqs(nmp); } /* * Sum up the statfs info for each of the DSs, so that the client will * receive the total for all DSs. */ static int nfsrv_pnfsstatfs(struct statfs *sf, struct mount *mp) { struct statfs *tsf; struct nfsdevice *ds; struct vnode **dvpp, **tdvpp, *dvp; uint64_t tot; int cnt, error = 0, i; if (nfsrv_devidcnt <= 0) return (ENXIO); dvpp = mallocarray(nfsrv_devidcnt, sizeof(*dvpp), M_TEMP, M_WAITOK); tsf = malloc(sizeof(*tsf), M_TEMP, M_WAITOK); /* Get an array of the dvps for the DSs. */ tdvpp = dvpp; i = 0; NFSDDSLOCK(); /* First, search for matches for same file system. */ TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) { if (ds->nfsdev_nmp != NULL && ds->nfsdev_mdsisset != 0 && fsidcmp(&ds->nfsdev_mdsfsid, &mp->mnt_stat.f_fsid) == 0) { if (++i > nfsrv_devidcnt) break; *tdvpp++ = ds->nfsdev_dvp; } } /* * If no matches for same file system, total all servers not assigned * to a file system. */ if (i == 0) { TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) { if (ds->nfsdev_nmp != NULL && ds->nfsdev_mdsisset == 0) { if (++i > nfsrv_devidcnt) break; *tdvpp++ = ds->nfsdev_dvp; } } } NFSDDSUNLOCK(); cnt = i; /* Do a VFS_STATFS() for each of the DSs and sum them up. */ tdvpp = dvpp; for (i = 0; i < cnt && error == 0; i++) { dvp = *tdvpp++; error = VFS_STATFS(dvp->v_mount, tsf); if (error == 0) { if (sf->f_bsize == 0) { if (tsf->f_bsize > 0) sf->f_bsize = tsf->f_bsize; else sf->f_bsize = 8192; } if (tsf->f_blocks > 0) { if (sf->f_bsize != tsf->f_bsize) { tot = tsf->f_blocks * tsf->f_bsize; sf->f_blocks += (tot / sf->f_bsize); } else sf->f_blocks += tsf->f_blocks; } if (tsf->f_bfree > 0) { if (sf->f_bsize != tsf->f_bsize) { tot = tsf->f_bfree * tsf->f_bsize; sf->f_bfree += (tot / sf->f_bsize); } else sf->f_bfree += tsf->f_bfree; } if (tsf->f_bavail > 0) { if (sf->f_bsize != tsf->f_bsize) { tot = tsf->f_bavail * tsf->f_bsize; sf->f_bavail += (tot / sf->f_bsize); } else sf->f_bavail += tsf->f_bavail; } } } free(tsf, M_TEMP); free(dvpp, M_TEMP); return (error); } /* * Set an NFSv4 acl. */ int nfsrv_setacl(struct vnode *vp, NFSACL_T *aclp, struct ucred *cred, NFSPROC_T *p) { int error; if (nfsrv_useacl == 0 || nfs_supportsnfsv4acls(vp) == 0) { error = NFSERR_ATTRNOTSUPP; goto out; } /* * With NFSv4 ACLs, chmod(2) may need to add additional entries. * Make sure it has enough room for that - splitting every entry * into two and appending "canonical six" entries at the end. * Cribbed out of kern/vfs_acl.c - Rick M. */ if (aclp->acl_cnt > (ACL_MAX_ENTRIES - 6) / 2) { error = NFSERR_ATTRNOTSUPP; goto out; } error = VOP_SETACL(vp, ACL_TYPE_NFS4, aclp, cred, p); if (error == 0) { error = nfsrv_dssetacl(vp, aclp, cred, p); if (error == ENOENT) error = 0; } out: NFSEXITCODE(error); return (error); } /* * Seek vnode op call (actually it is a VOP_IOCTL()). * This function is called with the vnode locked, but unlocks and vrele()s * the vp before returning. */ int nfsvno_seek(struct nfsrv_descript *nd, struct vnode *vp, u_long cmd, off_t *offp, int content, bool *eofp, struct ucred *cred, NFSPROC_T *p) { struct nfsvattr at; int error, ret; ASSERT_VOP_LOCKED(vp, "nfsvno_seek vp"); /* * Attempt to seek on a DS file. A return of ENOENT implies * there is no DS file to seek on. */ error = nfsrv_proxyds(vp, 0, 0, cred, p, NFSPROC_SEEKDS, NULL, NULL, NULL, NULL, NULL, offp, content, eofp); if (error != ENOENT) { vput(vp); return (error); } /* * Do the VOP_IOCTL() call. For the case where *offp == file_size, * VOP_IOCTL() will return ENXIO. However, the correct reply for * NFSv4.2 is *eofp == true and error == 0 for this case. */ NFSVOPUNLOCK(vp); error = VOP_IOCTL(vp, cmd, offp, 0, cred, p); *eofp = false; if (error == ENXIO || (error == 0 && cmd == FIOSEEKHOLE)) { /* Handle the cases where we might be at EOF. */ ret = nfsvno_getattr(vp, &at, nd, p, 0, NULL); if (ret == 0 && *offp == at.na_size) { *eofp = true; error = 0; } if (ret != 0 && error == 0) error = ret; } vrele(vp); NFSEXITCODE(error); return (error); } /* * Allocate vnode op call. */ int nfsvno_allocate(struct vnode *vp, off_t off, off_t len, struct ucred *cred, NFSPROC_T *p) { int error; off_t olen; ASSERT_VOP_ELOCKED(vp, "nfsvno_allocate vp"); /* * Attempt to allocate on a DS file. A return of ENOENT implies * there is no DS file to allocate on. */ error = nfsrv_proxyds(vp, off, 0, cred, p, NFSPROC_ALLOCATE, NULL, NULL, NULL, NULL, NULL, &len, 0, NULL); if (error != ENOENT) return (error); /* * Do the actual VOP_ALLOCATE(), looping so long as * progress is being made, to achieve completion. */ do { olen = len; error = VOP_ALLOCATE(vp, &off, &len, IO_SYNC, cred); if (error == 0 && len > 0 && olen > len) maybe_yield(); } while (error == 0 && len > 0 && olen > len); if (error == 0 && len > 0) error = NFSERR_IO; NFSEXITCODE(error); return (error); } /* * Deallocate vnode op call. */ int nfsvno_deallocate(struct vnode *vp, off_t off, off_t len, struct ucred *cred, NFSPROC_T *p) { int error; off_t olen; ASSERT_VOP_ELOCKED(vp, "nfsvno_deallocate vp"); /* * Attempt to deallocate on a DS file. A return of ENOENT implies * there is no DS file to deallocate on. */ error = nfsrv_proxyds(vp, off, 0, cred, p, NFSPROC_DEALLOCATE, NULL, NULL, NULL, NULL, NULL, &len, 0, NULL); if (error != ENOENT) return (error); /* * Do the actual VOP_DEALLOCATE(), looping so long as * progress is being made, to achieve completion. */ do { olen = len; error = VOP_DEALLOCATE(vp, &off, &len, 0, IO_SYNC, cred); if (error == 0 && len > 0 && olen > len) maybe_yield(); } while (error == 0 && len > 0 && olen > len); if (error == 0 && len > 0) error = NFSERR_IO; NFSEXITCODE(error); return (error); } /* * Get Extended Atribute vnode op into an mbuf list. */ int nfsvno_getxattr(struct vnode *vp, char *name, uint32_t maxresp, struct ucred *cred, uint64_t flag, int maxextsiz, struct thread *p, struct mbuf **mpp, struct mbuf **mpendp, int *lenp) { struct iovec *iv; struct uio io, *uiop = &io; struct mbuf *m, *m2; int alen, error, len, tlen; size_t siz; /* First, find out the size of the extended attribute. */ error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, NULL, &siz, cred, p); if (error != 0) return (NFSERR_NOXATTR); if (siz > maxresp - NFS_MAXXDR) return (NFSERR_XATTR2BIG); len = siz; tlen = NFSM_RNDUP(len); if (tlen > 0) { /* * If cnt > MCLBYTES and the reply will not be saved, use * ext_pgs mbufs for TLS. * For NFSv4.0, we do not know for sure if the reply will * be saved, so do not use ext_pgs mbufs for NFSv4.0. * Always use ext_pgs mbufs if ND_EXTPG is set. */ if ((flag & ND_EXTPG) != 0 || (tlen > MCLBYTES && (flag & (ND_TLS | ND_SAVEREPLY)) == ND_TLS && (flag & (ND_NFSV4 | ND_NFSV41)) != ND_NFSV4)) uiop->uio_iovcnt = nfsrv_createiovec_extpgs(tlen, maxextsiz, &m, &m2, &iv); else uiop->uio_iovcnt = nfsrv_createiovec(tlen, &m, &m2, &iv); uiop->uio_iov = iv; } else { uiop->uio_iovcnt = 0; uiop->uio_iov = iv = NULL; m = m2 = NULL; } uiop->uio_offset = 0; uiop->uio_resid = tlen; uiop->uio_rw = UIO_READ; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = p; #ifdef MAC error = mac_vnode_check_getextattr(cred, vp, EXTATTR_NAMESPACE_USER, name); if (error != 0) goto out; #endif if (tlen > 0) error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, uiop, NULL, cred, p); if (error != 0) goto out; if (uiop->uio_resid > 0) { alen = tlen; len = tlen - uiop->uio_resid; tlen = NFSM_RNDUP(len); if (alen != tlen) printf("nfsvno_getxattr: weird size read\n"); if (tlen == 0) { m_freem(m); m = m2 = NULL; } else if (alen != tlen || tlen != len) m2 = nfsrv_adj(m, alen - tlen, tlen - len); } *lenp = len; *mpp = m; *mpendp = m2; out: if (error != 0) { if (m != NULL) m_freem(m); *lenp = 0; } free(iv, M_TEMP); NFSEXITCODE(error); return (error); } /* * Set Extended attribute vnode op from an mbuf list. */ int nfsvno_setxattr(struct vnode *vp, char *name, int len, struct mbuf *m, char *cp, struct ucred *cred, struct thread *p) { struct iovec *iv; struct uio uio, *uiop = &uio; int cnt, error; error = 0; #ifdef MAC error = mac_vnode_check_setextattr(cred, vp, EXTATTR_NAMESPACE_USER, name); #endif if (error != 0) goto out; uiop->uio_rw = UIO_WRITE; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = p; uiop->uio_offset = 0; uiop->uio_resid = len; if (len > 0) { error = nfsrv_createiovecw(len, m, cp, &iv, &cnt); uiop->uio_iov = iv; uiop->uio_iovcnt = cnt; } else { uiop->uio_iov = iv = NULL; uiop->uio_iovcnt = 0; } if (error == 0) { error = VOP_SETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, uiop, cred, p); free(iv, M_TEMP); } out: NFSEXITCODE(error); return (error); } /* * Remove Extended attribute vnode op. */ int nfsvno_rmxattr(struct nfsrv_descript *nd, struct vnode *vp, char *name, struct ucred *cred, struct thread *p) { int error; /* * Get rid of any delegations. I am not sure why this is required, * but RFC-8276 says so. */ error = nfsrv_checkremove(vp, 0, nd, nd->nd_clientid, p); if (error != 0) goto out; #ifdef MAC error = mac_vnode_check_deleteextattr(cred, vp, EXTATTR_NAMESPACE_USER, name); if (error != 0) goto out; #endif error = VOP_DELETEEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, cred, p); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, NULL, cred, p); out: NFSEXITCODE(error); return (error); } /* * List Extended Atribute vnode op into an mbuf list. */ int nfsvno_listxattr(struct vnode *vp, uint64_t cookie, struct ucred *cred, struct thread *p, u_char **bufp, uint32_t *lenp, bool *eofp) { struct iovec iv; struct uio io; int error; size_t siz; *bufp = NULL; /* First, find out the size of the extended attribute. */ error = VOP_LISTEXTATTR(vp, EXTATTR_NAMESPACE_USER, NULL, &siz, cred, p); if (error != 0) return (NFSERR_NOXATTR); if (siz <= cookie) { *lenp = 0; *eofp = true; goto out; } if (siz > cookie + *lenp) { siz = cookie + *lenp; *eofp = false; } else *eofp = true; /* Just choose a sanity limit of 10Mbytes for malloc(M_TEMP). */ if (siz > 10 * 1024 * 1024) { error = NFSERR_XATTR2BIG; goto out; } *bufp = malloc(siz, M_TEMP, M_WAITOK); iv.iov_base = *bufp; iv.iov_len = siz; io.uio_iovcnt = 1; io.uio_iov = &iv; io.uio_offset = 0; io.uio_resid = siz; io.uio_rw = UIO_READ; io.uio_segflg = UIO_SYSSPACE; io.uio_td = p; #ifdef MAC error = mac_vnode_check_listextattr(cred, vp, EXTATTR_NAMESPACE_USER); if (error != 0) goto out; #endif error = VOP_LISTEXTATTR(vp, EXTATTR_NAMESPACE_USER, &io, NULL, cred, p); if (error != 0) goto out; if (io.uio_resid > 0) siz -= io.uio_resid; *lenp = siz; out: if (error != 0) { free(*bufp, M_TEMP); *bufp = NULL; } NFSEXITCODE(error); return (error); } /* * Trim trailing data off the mbuf list being built. */ void nfsm_trimtrailing(struct nfsrv_descript *nd, struct mbuf *mb, char *bpos, int bextpg, int bextpgsiz) { vm_page_t pg; int fullpgsiz, i; if (mb->m_next != NULL) { m_freem(mb->m_next); mb->m_next = NULL; } if ((mb->m_flags & M_EXTPG) != 0) { KASSERT(bextpg >= 0 && bextpg < mb->m_epg_npgs, ("nfsm_trimtrailing: bextpg out of range")); KASSERT(bpos == (char *)(void *) PHYS_TO_DMAP(mb->m_epg_pa[bextpg]) + PAGE_SIZE - bextpgsiz, ("nfsm_trimtrailing: bextpgsiz bad!")); /* First, get rid of any pages after this position. */ for (i = mb->m_epg_npgs - 1; i > bextpg; i--) { pg = PHYS_TO_VM_PAGE(mb->m_epg_pa[i]); vm_page_unwire_noq(pg); vm_page_free(pg); } mb->m_epg_npgs = bextpg + 1; if (bextpg == 0) fullpgsiz = PAGE_SIZE - mb->m_epg_1st_off; else fullpgsiz = PAGE_SIZE; mb->m_epg_last_len = fullpgsiz - bextpgsiz; mb->m_len = m_epg_pagelen(mb, 0, mb->m_epg_1st_off); for (i = 1; i < mb->m_epg_npgs; i++) mb->m_len += m_epg_pagelen(mb, i, 0); nd->nd_bextpgsiz = bextpgsiz; nd->nd_bextpg = bextpg; } else mb->m_len = bpos - mtod(mb, char *); nd->nd_mb = mb; nd->nd_bpos = bpos; } /* * Check to see if a put file handle operation should test for * NFSERR_WRONGSEC, although NFSv3 actually returns NFSERR_AUTHERR. * When Open is the next operation, NFSERR_WRONGSEC cannot be * replied for the Open cases that use a component. This can * be identified by the fact that the file handle's type is VDIR. */ bool nfsrv_checkwrongsec(struct nfsrv_descript *nd, int nextop, enum vtype vtyp) { if ((nd->nd_flag & ND_NFSV4) == 0) return (true); if ((nd->nd_flag & ND_LASTOP) != 0) return (false); if (nextop == NFSV4OP_PUTROOTFH || nextop == NFSV4OP_PUTFH || nextop == NFSV4OP_PUTPUBFH || nextop == NFSV4OP_RESTOREFH || nextop == NFSV4OP_LOOKUP || nextop == NFSV4OP_LOOKUPP || nextop == NFSV4OP_SECINFO || nextop == NFSV4OP_SECINFONONAME) return (false); if (nextop == NFSV4OP_OPEN && vtyp == VDIR) return (false); return (true); } /* * Check DSs marked no space. */ void nfsrv_checknospc(void) { struct statfs *tsf; struct nfsdevice *ds; struct vnode **dvpp, **tdvpp, *dvp; char *devid, *tdevid; int cnt, error = 0, i; if (nfsrv_devidcnt <= 0) return; dvpp = mallocarray(nfsrv_devidcnt, sizeof(*dvpp), M_TEMP, M_WAITOK); devid = malloc(nfsrv_devidcnt * NFSX_V4DEVICEID, M_TEMP, M_WAITOK); tsf = malloc(sizeof(*tsf), M_TEMP, M_WAITOK); /* Get an array of the dvps for the DSs. */ tdvpp = dvpp; tdevid = devid; i = 0; NFSDDSLOCK(); /* First, search for matches for same file system. */ TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) { if (ds->nfsdev_nmp != NULL && ds->nfsdev_nospc) { if (++i > nfsrv_devidcnt) break; *tdvpp++ = ds->nfsdev_dvp; NFSBCOPY(ds->nfsdev_deviceid, tdevid, NFSX_V4DEVICEID); tdevid += NFSX_V4DEVICEID; } } NFSDDSUNLOCK(); /* Do a VFS_STATFS() for each of the DSs and clear no space. */ cnt = i; tdvpp = dvpp; tdevid = devid; for (i = 0; i < cnt && error == 0; i++) { dvp = *tdvpp++; error = VFS_STATFS(dvp->v_mount, tsf); if (error == 0 && tsf->f_bavail > 0) { NFSD_DEBUG(1, "nfsrv_checknospc: reset nospc\n"); nfsrv_marknospc(tdevid, false); } tdevid += NFSX_V4DEVICEID; } free(tsf, M_TEMP); free(dvpp, M_TEMP); free(devid, M_TEMP); } extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *); /* * Called once to initialize data structures... */ static int nfsd_modevent(module_t mod, int type, void *data) { int error = 0, i; static int loaded = 0; switch (type) { case MOD_LOAD: if (loaded) goto out; newnfs_portinit(); for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { mtx_init(&nfsrchash_table[i].mtx, "nfsrtc", NULL, MTX_DEF); mtx_init(&nfsrcahash_table[i].mtx, "nfsrtca", NULL, MTX_DEF); } mtx_init(&nfsrc_udpmtx, "nfsuc", NULL, MTX_DEF); mtx_init(&nfs_v4root_mutex, "nfs4rt", NULL, MTX_DEF); mtx_init(&nfsv4root_mnt.mnt_mtx, "nfs4mnt", NULL, MTX_DEF); mtx_init(&nfsrv_dontlistlock_mtx, "nfs4dnl", NULL, MTX_DEF); mtx_init(&nfsrv_recalllock_mtx, "nfs4rec", NULL, MTX_DEF); lockinit(&nfsv4root_mnt.mnt_explock, PVFS, "explock", 0, 0); callout_init(&nfsd_callout, 1); nfsrvd_initcache(); nfsd_init(); NFSD_LOCK(); nfsrvd_init(0); NFSD_UNLOCK(); nfsd_mntinit(); #ifdef VV_DISABLEDELEG vn_deleg_ops.vndeleg_recall = nfsd_recalldelegation; vn_deleg_ops.vndeleg_disable = nfsd_disabledelegation; #endif nfsd_call_nfsd = nfssvc_nfsd; loaded = 1; break; case MOD_UNLOAD: if (newnfs_numnfsd != 0) { error = EBUSY; break; } #ifdef VV_DISABLEDELEG vn_deleg_ops.vndeleg_recall = NULL; vn_deleg_ops.vndeleg_disable = NULL; #endif nfsd_call_nfsd = NULL; callout_drain(&nfsd_callout); /* Clean out all NFSv4 state. */ nfsrv_throwawayallstate(curthread); /* Clean the NFS server reply cache */ nfsrvd_cleancache(); /* Free up the krpc server pool. */ if (nfsrvd_pool != NULL) svcpool_destroy(nfsrvd_pool); /* and get rid of the locks */ for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { mtx_destroy(&nfsrchash_table[i].mtx); mtx_destroy(&nfsrcahash_table[i].mtx); } mtx_destroy(&nfsrc_udpmtx); mtx_destroy(&nfs_v4root_mutex); mtx_destroy(&nfsv4root_mnt.mnt_mtx); mtx_destroy(&nfsrv_dontlistlock_mtx); mtx_destroy(&nfsrv_recalllock_mtx); for (i = 0; i < nfsrv_sessionhashsize; i++) mtx_destroy(&nfssessionhash[i].mtx); if (nfslayouthash != NULL) { for (i = 0; i < nfsrv_layouthashsize; i++) mtx_destroy(&nfslayouthash[i].mtx); free(nfslayouthash, M_NFSDSESSION); } lockdestroy(&nfsv4root_mnt.mnt_explock); free(nfsclienthash, M_NFSDCLIENT); free(nfslockhash, M_NFSDLOCKFILE); free(nfssessionhash, M_NFSDSESSION); loaded = 0; break; default: error = EOPNOTSUPP; break; } out: NFSEXITCODE(error); return (error); } static moduledata_t nfsd_mod = { "nfsd", nfsd_modevent, NULL, }; DECLARE_MODULE(nfsd, nfsd_mod, SI_SUB_VFS, SI_ORDER_ANY); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfsd, 1); MODULE_DEPEND(nfsd, nfscommon, 1, 1, 1); MODULE_DEPEND(nfsd, nfslockd, 1, 1, 1); MODULE_DEPEND(nfsd, krpc, 1, 1, 1); MODULE_DEPEND(nfsd, nfssvc, 1, 1, 1); diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c index db1075596b91..23360f9e3909 100644 --- a/sys/fs/nfsserver/nfs_nfsdserv.c +++ b/sys/fs/nfsserver/nfs_nfsdserv.c @@ -1,6188 +1,6186 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" /* * nfs version 2, 3 and 4 server calls to vnode ops * - these routines generally have 3 phases * 1 - break down and validate rpc request in mbuf list * 2 - do the vnode ops for the request, usually by calling a nfsvno_XXX() * function in nfsd_port.c * 3 - build the rpc reply in an mbuf list * For nfsv4, these functions are called for each Op within the Compound RPC. */ #include #include #include /* Global vars */ extern u_int32_t newnfs_false, newnfs_true; extern enum vtype nv34tov_type[8]; extern struct timeval nfsboottime; extern int nfsrv_enable_crossmntpt; extern int nfsrv_statehashsize; extern int nfsrv_layouthashsize; extern time_t nfsdev_time; extern volatile int nfsrv_devidcnt; extern int nfsd_debuglevel; extern u_long sb_max_adj; extern int nfsrv_pnfsatime; extern int nfsrv_maxpnfsmirror; extern uint32_t nfs_srvmaxio; static int nfs_async = 0; SYSCTL_DECL(_vfs_nfsd); SYSCTL_INT(_vfs_nfsd, OID_AUTO, async, CTLFLAG_RW, &nfs_async, 0, "Tell client that writes were synced even though they were not"); extern int nfsrv_doflexfile; SYSCTL_INT(_vfs_nfsd, OID_AUTO, default_flexfile, CTLFLAG_RW, &nfsrv_doflexfile, 0, "Make Flex File Layout the default for pNFS"); static int nfsrv_linux42server = 1; SYSCTL_INT(_vfs_nfsd, OID_AUTO, linux42server, CTLFLAG_RW, &nfsrv_linux42server, 0, "Enable Linux style NFSv4.2 server (non-RFC compliant)"); static bool nfsrv_openaccess = true; SYSCTL_BOOL(_vfs_nfsd, OID_AUTO, v4openaccess, CTLFLAG_RW, &nfsrv_openaccess, 0, "Enable Linux style NFSv4 Open access check"); static char nfsrv_scope[NFSV4_OPAQUELIMIT]; SYSCTL_STRING(_vfs_nfsd, OID_AUTO, scope, CTLFLAG_RWTUN, &nfsrv_scope, NFSV4_OPAQUELIMIT, "Server scope"); static char nfsrv_owner_major[NFSV4_OPAQUELIMIT]; SYSCTL_STRING(_vfs_nfsd, OID_AUTO, owner_major, CTLFLAG_RWTUN, &nfsrv_owner_major, NFSV4_OPAQUELIMIT, "Server owner major"); static uint64_t nfsrv_owner_minor; SYSCTL_U64(_vfs_nfsd, OID_AUTO, owner_minor, CTLFLAG_RWTUN, &nfsrv_owner_minor, 0, "Server owner minor"); /* * Only enable this if all your exported file systems * (or pNFS DSs for the pNFS case) support VOP_ALLOCATE. */ static bool nfsrv_doallocate = false; SYSCTL_BOOL(_vfs_nfsd, OID_AUTO, enable_v42allocate, CTLFLAG_RW, &nfsrv_doallocate, 0, "Enable NFSv4.2 Allocate operation"); /* * This list defines the GSS mechanisms supported. * (Don't ask me how you get these strings from the RFC stuff like * iso(1), org(3)... but someone did it, so I don't need to know.) */ static struct nfsgss_mechlist nfsgss_mechlist[] = { { 9, "\052\206\110\206\367\022\001\002\002", 11 }, { 0, "", 0 }, }; /* local functions */ static void nfsrvd_symlinksub(struct nfsrv_descript *nd, struct nameidata *ndp, struct nfsvattr *nvap, fhandle_t *fhp, vnode_t *vpp, vnode_t dirp, struct nfsvattr *dirforp, struct nfsvattr *diraftp, int *diraft_retp, nfsattrbit_t *attrbitp, NFSACL_T *aclp, NFSPROC_T *p, struct nfsexstuff *exp, char *pathcp, int pathlen); static void nfsrvd_mkdirsub(struct nfsrv_descript *nd, struct nameidata *ndp, struct nfsvattr *nvap, fhandle_t *fhp, vnode_t *vpp, vnode_t dirp, struct nfsvattr *dirforp, struct nfsvattr *diraftp, int *diraft_retp, nfsattrbit_t *attrbitp, NFSACL_T *aclp, NFSPROC_T *p, struct nfsexstuff *exp); /* * nfs access service (not a part of NFS V2) */ int nfsrvd_access(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { u_int32_t *tl; int getret, error = 0; struct nfsvattr nva; u_int32_t testmode, nfsmode, supported = 0; accmode_t deletebit; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_postopattr(nd, 1, &nva); goto out; } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nfsmode = fxdr_unsigned(u_int32_t, *tl); if ((nd->nd_flag & ND_NFSV4) && (nfsmode & ~(NFSACCESS_READ | NFSACCESS_LOOKUP | NFSACCESS_MODIFY | NFSACCESS_EXTEND | NFSACCESS_DELETE | NFSACCESS_EXECUTE | NFSACCESS_XAREAD | NFSACCESS_XAWRITE | NFSACCESS_XALIST))) { nd->nd_repstat = NFSERR_INVAL; vput(vp); goto out; } if (nfsmode & NFSACCESS_READ) { supported |= NFSACCESS_READ; if (nfsvno_accchk(vp, VREAD, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, &supported)) nfsmode &= ~NFSACCESS_READ; } if (nfsmode & NFSACCESS_MODIFY) { supported |= NFSACCESS_MODIFY; if (nfsvno_accchk(vp, VWRITE, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, &supported)) nfsmode &= ~NFSACCESS_MODIFY; } if (nfsmode & NFSACCESS_EXTEND) { supported |= NFSACCESS_EXTEND; if (nfsvno_accchk(vp, VWRITE | VAPPEND, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, &supported)) nfsmode &= ~NFSACCESS_EXTEND; } if (nfsmode & NFSACCESS_XAREAD) { supported |= NFSACCESS_XAREAD; if (nfsvno_accchk(vp, VREAD, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, &supported)) nfsmode &= ~NFSACCESS_XAREAD; } if (nfsmode & NFSACCESS_XAWRITE) { supported |= NFSACCESS_XAWRITE; if (nfsvno_accchk(vp, VWRITE, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, &supported)) nfsmode &= ~NFSACCESS_XAWRITE; } if (nfsmode & NFSACCESS_XALIST) { supported |= NFSACCESS_XALIST; if (nfsvno_accchk(vp, VREAD, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, &supported)) nfsmode &= ~NFSACCESS_XALIST; } if (nfsmode & NFSACCESS_DELETE) { supported |= NFSACCESS_DELETE; if (vp->v_type == VDIR) deletebit = VDELETE_CHILD; else deletebit = VDELETE; if (nfsvno_accchk(vp, deletebit, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, &supported)) nfsmode &= ~NFSACCESS_DELETE; } if (vp->v_type == VDIR) testmode = NFSACCESS_LOOKUP; else testmode = NFSACCESS_EXECUTE; if (nfsmode & testmode) { supported |= (nfsmode & testmode); if (nfsvno_accchk(vp, VEXEC, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, &supported)) nfsmode &= ~testmode; } nfsmode &= supported; if (nd->nd_flag & ND_NFSV3) { getret = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); nfsrv_postopattr(nd, getret, &nva); } vput(vp); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(supported); } else NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(nfsmode); out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfs getattr service */ int nfsrvd_getattr(struct nfsrv_descript *nd, int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { struct nfsvattr nva; fhandle_t fh; int at_root = 0, error = 0, supports_nfsv4acls; struct nfsreferral *refp; nfsattrbit_t attrbits, tmpbits; struct mount *mp; struct vnode *tvp = NULL; struct vattr va; uint64_t mounted_on_fileno = 0; accmode_t accmode; struct thread *p = curthread; if (nd->nd_repstat) goto out; if (nd->nd_flag & ND_NFSV4) { error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error) { vput(vp); goto out; } /* * Check for a referral. */ refp = nfsv4root_getreferral(vp, NULL, 0); if (refp != NULL) { (void) nfsrv_putreferralattr(nd, &attrbits, refp, 1, &nd->nd_repstat); vput(vp); goto out; } if (nd->nd_repstat == 0) { accmode = 0; NFSSET_ATTRBIT(&tmpbits, &attrbits); /* * GETATTR with write-only attr time_access_set and time_modify_set * should return NFS4ERR_INVAL. */ if (NFSISSET_ATTRBIT(&tmpbits, NFSATTRBIT_TIMEACCESSSET) || NFSISSET_ATTRBIT(&tmpbits, NFSATTRBIT_TIMEMODIFYSET)){ error = NFSERR_INVAL; vput(vp); goto out; } if (NFSISSET_ATTRBIT(&tmpbits, NFSATTRBIT_ACL)) { NFSCLRBIT_ATTRBIT(&tmpbits, NFSATTRBIT_ACL); accmode |= VREAD_ACL; } if (NFSNONZERO_ATTRBIT(&tmpbits)) accmode |= VREAD_ATTRIBUTES; if (accmode != 0) nd->nd_repstat = nfsvno_accchk(vp, accmode, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); } } if (!nd->nd_repstat) nd->nd_repstat = nfsvno_getattr(vp, &nva, nd, p, 1, &attrbits); if (!nd->nd_repstat) { if (nd->nd_flag & ND_NFSV4) { if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_FILEHANDLE)) nd->nd_repstat = nfsvno_getfh(vp, &fh, p); if (!nd->nd_repstat) nd->nd_repstat = nfsrv_checkgetattr(nd, vp, &nva, &attrbits, p); if (nd->nd_repstat == 0) { supports_nfsv4acls = nfs_supportsnfsv4acls(vp); mp = vp->v_mount; if (nfsrv_enable_crossmntpt != 0 && vp->v_type == VDIR && (vp->v_vflag & VV_ROOT) != 0 && vp != rootvnode) { tvp = mp->mnt_vnodecovered; VREF(tvp); at_root = 1; } else at_root = 0; vfs_ref(mp); NFSVOPUNLOCK(vp); if (at_root != 0) { if ((nd->nd_repstat = NFSVOPLOCK(tvp, LK_SHARED)) == 0) { nd->nd_repstat = VOP_GETATTR( tvp, &va, nd->nd_cred); vput(tvp); } else vrele(tvp); if (nd->nd_repstat == 0) mounted_on_fileno = (uint64_t) va.va_fileid; else at_root = 0; } if (nd->nd_repstat == 0) nd->nd_repstat = vfs_busy(mp, 0); vfs_rel(mp); if (nd->nd_repstat == 0) { (void)nfsvno_fillattr(nd, mp, vp, &nva, &fh, 0, &attrbits, nd->nd_cred, p, isdgram, 1, supports_nfsv4acls, at_root, mounted_on_fileno); vfs_unbusy(mp); } vrele(vp); } else vput(vp); } else { nfsrv_fillattr(nd, &nva); vput(vp); } } else { vput(vp); } out: NFSEXITCODE2(error, nd); return (error); } /* * nfs setattr service */ int nfsrvd_setattr(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { struct nfsvattr nva, nva2; u_int32_t *tl; int preat_ret = 1, postat_ret = 1, gcheck = 0, error = 0; int gotproxystateid; struct timespec guard = { 0, 0 }; nfsattrbit_t attrbits, retbits; nfsv4stateid_t stateid; NFSACL_T *aclp = NULL; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_wcc(nd, preat_ret, &nva2, postat_ret, &nva); goto out; } #ifdef NFS4_ACL_EXTATTR_NAME aclp = acl_alloc(M_WAITOK); aclp->acl_cnt = 0; #endif gotproxystateid = 0; NFSVNO_ATTRINIT(&nva); if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); stateid.other[0] = *tl++; stateid.other[1] = *tl++; stateid.other[2] = *tl; if (stateid.other[0] == 0x55555555 && stateid.other[1] == 0x55555555 && stateid.other[2] == 0x55555555 && stateid.seqid == 0xffffffff) gotproxystateid = 1; } error = nfsrv_sattr(nd, vp, &nva, &attrbits, aclp, p); if (error) goto nfsmout; /* For NFSv4, only va_uid is used from nva2. */ NFSZERO_ATTRBIT(&retbits); NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_OWNER); preat_ret = nfsvno_getattr(vp, &nva2, nd, p, 1, &retbits); if (!nd->nd_repstat) nd->nd_repstat = preat_ret; NFSZERO_ATTRBIT(&retbits); if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); gcheck = fxdr_unsigned(int, *tl); if (gcheck) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); fxdr_nfsv3time(tl, &guard); } if (!nd->nd_repstat && gcheck && (nva2.na_ctime.tv_sec != guard.tv_sec || nva2.na_ctime.tv_nsec != guard.tv_nsec)) nd->nd_repstat = NFSERR_NOT_SYNC; if (nd->nd_repstat) { vput(vp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif nfsrv_wcc(nd, preat_ret, &nva2, postat_ret, &nva); goto out; } } else if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV4)) nd->nd_repstat = nfsrv_checkuidgid(nd, &nva); /* * Now that we have all the fields, lets do it. * If the size is being changed write access is required, otherwise * just check for a read only file system. */ if (!nd->nd_repstat) { if (NFSVNO_NOTSETSIZE(&nva)) { if (NFSVNO_EXRDONLY(exp) || (vp->v_mount->mnt_flag & MNT_RDONLY)) nd->nd_repstat = EROFS; } else { if (vp->v_type != VREG) nd->nd_repstat = EINVAL; else if (nva2.na_uid != nd->nd_cred->cr_uid || NFSVNO_EXSTRICTACCESS(exp)) nd->nd_repstat = nfsvno_accchk(vp, VWRITE, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); } } /* * Proxy operations from the MDS are allowed via the all 0s special * stateid. */ if (nd->nd_repstat == 0 && (nd->nd_flag & ND_NFSV4) != 0 && gotproxystateid == 0) nd->nd_repstat = nfsrv_checksetattr(vp, nd, &stateid, &nva, &attrbits, exp, p); if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV4)) { /* * For V4, try setting the attrbutes in sets, so that the * reply bitmap will be correct for an error case. */ if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_OWNER) || NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_OWNERGROUP)) { NFSVNO_ATTRINIT(&nva2); NFSVNO_SETATTRVAL(&nva2, uid, nva.na_uid); NFSVNO_SETATTRVAL(&nva2, gid, nva.na_gid); nd->nd_repstat = nfsvno_setattr(vp, &nva2, nd->nd_cred, p, exp); if (!nd->nd_repstat) { if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_OWNER)) NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_OWNER); if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_OWNERGROUP)) NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_OWNERGROUP); } } if (!nd->nd_repstat && NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_SIZE)) { NFSVNO_ATTRINIT(&nva2); NFSVNO_SETATTRVAL(&nva2, size, nva.na_size); nd->nd_repstat = nfsvno_setattr(vp, &nva2, nd->nd_cred, p, exp); if (!nd->nd_repstat) NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_SIZE); } if (!nd->nd_repstat && (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET) || NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFYSET))) { NFSVNO_ATTRINIT(&nva2); NFSVNO_SETATTRVAL(&nva2, atime, nva.na_atime); NFSVNO_SETATTRVAL(&nva2, mtime, nva.na_mtime); if (nva.na_vaflags & VA_UTIMES_NULL) { nva2.na_vaflags |= VA_UTIMES_NULL; NFSVNO_SETACTIVE(&nva2, vaflags); } nd->nd_repstat = nfsvno_setattr(vp, &nva2, nd->nd_cred, p, exp); if (!nd->nd_repstat) { if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET)) NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_TIMEACCESSSET); if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFYSET)) NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_TIMEMODIFYSET); } } if (!nd->nd_repstat && NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE)) { NFSVNO_ATTRINIT(&nva2); NFSVNO_SETATTRVAL(&nva2, btime, nva.na_btime); nd->nd_repstat = nfsvno_setattr(vp, &nva2, nd->nd_cred, p, exp); if (!nd->nd_repstat) NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_TIMECREATE); } if (!nd->nd_repstat && (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_MODE) || NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_MODESETMASKED))) { NFSVNO_ATTRINIT(&nva2); NFSVNO_SETATTRVAL(&nva2, mode, nva.na_mode); nd->nd_repstat = nfsvno_setattr(vp, &nva2, nd->nd_cred, p, exp); if (!nd->nd_repstat) { if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_MODE)) NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_MODE); if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_MODESETMASKED)) NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_MODESETMASKED); } } #ifdef NFS4_ACL_EXTATTR_NAME if (!nd->nd_repstat && aclp->acl_cnt > 0 && NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_ACL)) { nd->nd_repstat = nfsrv_setacl(vp, aclp, nd->nd_cred, p); if (!nd->nd_repstat) NFSSETBIT_ATTRBIT(&retbits, NFSATTRBIT_ACL); } #endif } else if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_setattr(vp, &nva, nd->nd_cred, p, exp); } if (nd->nd_flag & (ND_NFSV2 | ND_NFSV3)) { postat_ret = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); if (!nd->nd_repstat) nd->nd_repstat = postat_ret; } vput(vp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif if (nd->nd_flag & ND_NFSV3) nfsrv_wcc(nd, preat_ret, &nva2, postat_ret, &nva); else if (nd->nd_flag & ND_NFSV4) (void) nfsrv_putattrbit(nd, &retbits); else if (!nd->nd_repstat) nfsrv_fillattr(nd, &nva); out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif if (nd->nd_flag & ND_NFSV4) { /* * For all nd_repstat, the V4 reply includes a bitmap, * even NFSERR_BADXDR, which is what this will end up * returning. */ (void) nfsrv_putattrbit(nd, &retbits); } NFSEXITCODE2(error, nd); return (error); } /* * nfs lookup rpc * (Also performs lookup parent for v4) */ int nfsrvd_lookup(struct nfsrv_descript *nd, __unused int isdgram, vnode_t dp, vnode_t *vpp, fhandle_t *fhp, struct nfsexstuff *exp) { struct nameidata named; vnode_t vp, dirp = NULL; int error = 0, dattr_ret = 1; struct nfsvattr nva, dattr; char *bufp; u_long *hashp; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_postopattr(nd, dattr_ret, &dattr); goto out; } /* * For some reason, if dp is a symlink, the error * returned is supposed to be NFSERR_SYMLINK and not NFSERR_NOTDIR. */ if (dp->v_type == VLNK && (nd->nd_flag & ND_NFSV4)) { nd->nd_repstat = NFSERR_SYMLINK; vrele(dp); goto out; } NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, LOOKUP, LOCKLEAF | SAVESTART); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error) { vrele(dp); nfsvno_relpathbuf(&named); goto out; } if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_namei(nd, &named, dp, 0, exp, &dirp); } else { vrele(dp); nfsvno_relpathbuf(&named); } if (nd->nd_repstat) { if (dirp) { if (nd->nd_flag & ND_NFSV3) dattr_ret = nfsvno_getattr(dirp, &dattr, nd, p, 0, NULL); vrele(dirp); } if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, dattr_ret, &dattr); goto out; } if (named.ni_startdir) vrele(named.ni_startdir); nfsvno_relpathbuf(&named); vp = named.ni_vp; if ((nd->nd_flag & ND_NFSV4) != 0 && !NFSVNO_EXPORTED(exp) && vp->v_type != VDIR && vp->v_type != VLNK) /* * Only allow lookup of VDIR and VLNK for traversal of * non-exported volumes during NFSv4 mounting. */ nd->nd_repstat = ENOENT; if (nd->nd_repstat == 0) nd->nd_repstat = nfsvno_getfh(vp, fhp, p); if (!(nd->nd_flag & ND_NFSV4) && !nd->nd_repstat) nd->nd_repstat = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); if (vpp != NULL && nd->nd_repstat == 0) *vpp = vp; else vput(vp); if (dirp) { if (nd->nd_flag & ND_NFSV3) dattr_ret = nfsvno_getattr(dirp, &dattr, nd, p, 0, NULL); vrele(dirp); } if (nd->nd_repstat) { if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, dattr_ret, &dattr); goto out; } if (nd->nd_flag & ND_NFSV2) { (void) nfsm_fhtom(nd, (u_int8_t *)fhp, 0, 0); nfsrv_fillattr(nd, &nva); } else if (nd->nd_flag & ND_NFSV3) { (void) nfsm_fhtom(nd, (u_int8_t *)fhp, 0, 0); nfsrv_postopattr(nd, 0, &nva); nfsrv_postopattr(nd, dattr_ret, &dattr); } out: NFSEXITCODE2(error, nd); return (error); } /* * nfs readlink service */ int nfsrvd_readlink(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; struct mbuf *mp = NULL, *mpend = NULL; int getret = 1, len; struct nfsvattr nva; struct thread *p = curthread; uint16_t off; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &nva); goto out; } if (vp->v_type != VLNK) { if (nd->nd_flag & ND_NFSV2) nd->nd_repstat = ENXIO; else nd->nd_repstat = EINVAL; } if (nd->nd_repstat == 0) { if ((nd->nd_flag & ND_EXTPG) != 0) nd->nd_repstat = nfsvno_readlink(vp, nd->nd_cred, nd->nd_maxextsiz, p, &mp, &mpend, &len); else nd->nd_repstat = nfsvno_readlink(vp, nd->nd_cred, 0, p, &mp, &mpend, &len); } if (nd->nd_flag & ND_NFSV3) getret = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &nva); if (nd->nd_repstat) goto out; NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(len); if (mp != NULL) { nd->nd_mb->m_next = mp; nd->nd_mb = mpend; if ((mpend->m_flags & M_EXTPG) != 0) { nd->nd_bextpg = mpend->m_epg_npgs - 1; nd->nd_bpos = (char *)(void *) PHYS_TO_DMAP(mpend->m_epg_pa[nd->nd_bextpg]); off = (nd->nd_bextpg == 0) ? mpend->m_epg_1st_off : 0; nd->nd_bpos += off + mpend->m_epg_last_len; nd->nd_bextpgsiz = PAGE_SIZE - mpend->m_epg_last_len - off; } else nd->nd_bpos = mtod(mpend, char *) + mpend->m_len; } out: NFSEXITCODE2(0, nd); return (0); } /* * nfs read service */ int nfsrvd_read(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { u_int32_t *tl; int error = 0, cnt, getret = 1, gotproxystateid, reqlen, eof = 0; struct mbuf *m2, *m3; struct nfsvattr nva; off_t off = 0x0; struct nfsstate st, *stp = &st; struct nfslock lo, *lop = &lo; nfsv4stateid_t stateid; nfsquad_t clientid; struct thread *p = curthread; uint16_t poff; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &nva); goto out; } if (nd->nd_flag & ND_NFSV2) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); off = (off_t)fxdr_unsigned(u_int32_t, *tl++); reqlen = fxdr_unsigned(int, *tl); } else if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); off = fxdr_hyper(tl); tl += 2; reqlen = fxdr_unsigned(int, *tl); } else { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 3*NFSX_UNSIGNED); reqlen = fxdr_unsigned(int, *(tl + 6)); } if (reqlen > NFS_SRVMAXDATA(nd)) { reqlen = NFS_SRVMAXDATA(nd); } else if (reqlen < 0) { error = EBADRPC; goto nfsmout; } gotproxystateid = 0; if (nd->nd_flag & ND_NFSV4) { stp->ls_flags = (NFSLCK_CHECK | NFSLCK_READACCESS); lop->lo_flags = NFSLCK_READ; stp->ls_ownerlen = 0; stp->ls_op = NULL; stp->ls_uid = nd->nd_cred->cr_uid; stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); clientid.lval[0] = stp->ls_stateid.other[0] = *tl++; clientid.lval[1] = stp->ls_stateid.other[1] = *tl++; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK1 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } stp->ls_stateid.other[2] = *tl++; /* * Don't allow the client to use a special stateid for a DS op. */ if ((nd->nd_flag & ND_DSSERVER) != 0 && ((stp->ls_stateid.other[0] == 0x0 && stp->ls_stateid.other[1] == 0x0 && stp->ls_stateid.other[2] == 0x0) || (stp->ls_stateid.other[0] == 0xffffffff && stp->ls_stateid.other[1] == 0xffffffff && stp->ls_stateid.other[2] == 0xffffffff) || stp->ls_stateid.seqid != 0)) nd->nd_repstat = NFSERR_BADSTATEID; /* However, allow the proxy stateid. */ if (stp->ls_stateid.seqid == 0xffffffff && stp->ls_stateid.other[0] == 0x55555555 && stp->ls_stateid.other[1] == 0x55555555 && stp->ls_stateid.other[2] == 0x55555555) gotproxystateid = 1; off = fxdr_hyper(tl); lop->lo_first = off; tl += 2; lop->lo_end = off + reqlen; /* * Paranoia, just in case it wraps around. */ if (lop->lo_end < off) lop->lo_end = NFS64BITSSET; } if (vp->v_type != VREG) { if (nd->nd_flag & ND_NFSV3) nd->nd_repstat = EINVAL; else nd->nd_repstat = (vp->v_type == VDIR) ? EISDIR : EINVAL; } getret = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); if (!nd->nd_repstat) nd->nd_repstat = getret; if (!nd->nd_repstat && (nva.na_uid != nd->nd_cred->cr_uid || NFSVNO_EXSTRICTACCESS(exp))) { nd->nd_repstat = nfsvno_accchk(vp, VREAD, nd->nd_cred, exp, p, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat) nd->nd_repstat = nfsvno_accchk(vp, VEXEC, nd->nd_cred, exp, p, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); } /* * DS reads are marked by ND_DSSERVER or use the proxy special * stateid. */ if (nd->nd_repstat == 0 && (nd->nd_flag & (ND_NFSV4 | ND_DSSERVER)) == ND_NFSV4 && gotproxystateid == 0) nd->nd_repstat = nfsrv_lockctrl(vp, &stp, &lop, NULL, clientid, &stateid, exp, nd, p); if (nd->nd_repstat) { vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &nva); goto out; } if (off >= nva.na_size) { cnt = 0; eof = 1; } else if (reqlen == 0) cnt = 0; else if ((off + reqlen) >= nva.na_size) { cnt = nva.na_size - off; eof = 1; } else cnt = reqlen; m3 = NULL; if (cnt > 0) { /* * If cnt > MCLBYTES and the reply will not be saved, use * ext_pgs mbufs for TLS. * For NFSv4.0, we do not know for sure if the reply will * be saved, so do not use ext_pgs mbufs for NFSv4.0. * Always use ext_pgs mbufs if ND_EXTPG is set. */ if ((nd->nd_flag & ND_EXTPG) != 0 || (cnt > MCLBYTES && (nd->nd_flag & (ND_TLS | ND_SAVEREPLY)) == ND_TLS && (nd->nd_flag & (ND_NFSV4 | ND_NFSV41)) != ND_NFSV4)) nd->nd_repstat = nfsvno_read(vp, off, cnt, nd->nd_cred, nd->nd_maxextsiz, p, &m3, &m2); else nd->nd_repstat = nfsvno_read(vp, off, cnt, nd->nd_cred, 0, p, &m3, &m2); if (!(nd->nd_flag & ND_NFSV4)) { getret = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); if (!nd->nd_repstat) nd->nd_repstat = getret; } if (nd->nd_repstat) { vput(vp); if (m3) m_freem(m3); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &nva); goto out; } } vput(vp); if (nd->nd_flag & ND_NFSV2) { nfsrv_fillattr(nd, &nva); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); } else { if (nd->nd_flag & ND_NFSV3) { nfsrv_postopattr(nd, getret, &nva); NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(cnt); } else NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (eof) *tl++ = newnfs_true; else *tl++ = newnfs_false; } *tl = txdr_unsigned(cnt); if (m3) { nd->nd_mb->m_next = m3; nd->nd_mb = m2; if ((m2->m_flags & M_EXTPG) != 0) { nd->nd_flag |= ND_EXTPG; nd->nd_bextpg = m2->m_epg_npgs - 1; nd->nd_bpos = (char *)(void *) PHYS_TO_DMAP(m2->m_epg_pa[nd->nd_bextpg]); poff = (nd->nd_bextpg == 0) ? m2->m_epg_1st_off : 0; nd->nd_bpos += poff + m2->m_epg_last_len; nd->nd_bextpgsiz = PAGE_SIZE - m2->m_epg_last_len - poff; } else nd->nd_bpos = mtod(m2, char *) + m2->m_len; } out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfs write service */ int nfsrvd_write(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { u_int32_t *tl; struct nfsvattr nva, forat; int aftat_ret = 1, retlen, len, error = 0, forat_ret = 1; int gotproxystateid, stable = NFSWRITE_FILESYNC; off_t off; struct nfsstate st, *stp = &st; struct nfslock lo, *lop = &lo; nfsv4stateid_t stateid; nfsquad_t clientid; nfsattrbit_t attrbits; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_wcc(nd, forat_ret, &forat, aftat_ret, &nva); goto out; } gotproxystateid = 0; if (nd->nd_flag & ND_NFSV2) { NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); off = (off_t)fxdr_unsigned(u_int32_t, *++tl); tl += 2; retlen = len = fxdr_unsigned(int32_t, *tl); } else if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); off = fxdr_hyper(tl); tl += 3; stable = fxdr_unsigned(int, *tl++); retlen = len = fxdr_unsigned(int32_t, *tl); } else { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 4 * NFSX_UNSIGNED); stp->ls_flags = (NFSLCK_CHECK | NFSLCK_WRITEACCESS); lop->lo_flags = NFSLCK_WRITE; stp->ls_ownerlen = 0; stp->ls_op = NULL; stp->ls_uid = nd->nd_cred->cr_uid; stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); clientid.lval[0] = stp->ls_stateid.other[0] = *tl++; clientid.lval[1] = stp->ls_stateid.other[1] = *tl++; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK2 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } stp->ls_stateid.other[2] = *tl++; /* * Don't allow the client to use a special stateid for a DS op. */ if ((nd->nd_flag & ND_DSSERVER) != 0 && ((stp->ls_stateid.other[0] == 0x0 && stp->ls_stateid.other[1] == 0x0 && stp->ls_stateid.other[2] == 0x0) || (stp->ls_stateid.other[0] == 0xffffffff && stp->ls_stateid.other[1] == 0xffffffff && stp->ls_stateid.other[2] == 0xffffffff) || stp->ls_stateid.seqid != 0)) nd->nd_repstat = NFSERR_BADSTATEID; /* However, allow the proxy stateid. */ if (stp->ls_stateid.seqid == 0xffffffff && stp->ls_stateid.other[0] == 0x55555555 && stp->ls_stateid.other[1] == 0x55555555 && stp->ls_stateid.other[2] == 0x55555555) gotproxystateid = 1; off = fxdr_hyper(tl); lop->lo_first = off; tl += 2; stable = fxdr_unsigned(int, *tl++); retlen = len = fxdr_unsigned(int32_t, *tl); lop->lo_end = off + len; /* * Paranoia, just in case it wraps around, which shouldn't * ever happen anyhow. */ if (lop->lo_end < lop->lo_first) lop->lo_end = NFS64BITSSET; } if (retlen > nfs_srvmaxio || retlen < 0) nd->nd_repstat = EIO; if (vp->v_type != VREG && !nd->nd_repstat) { if (nd->nd_flag & ND_NFSV3) nd->nd_repstat = EINVAL; else nd->nd_repstat = (vp->v_type == VDIR) ? EISDIR : EINVAL; } NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNER); forat_ret = nfsvno_getattr(vp, &forat, nd, p, 1, &attrbits); if (!nd->nd_repstat) nd->nd_repstat = forat_ret; if (!nd->nd_repstat && (forat.na_uid != nd->nd_cred->cr_uid || NFSVNO_EXSTRICTACCESS(exp))) nd->nd_repstat = nfsvno_accchk(vp, VWRITE, nd->nd_cred, exp, p, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); /* * DS reads are marked by ND_DSSERVER or use the proxy special * stateid. */ if (nd->nd_repstat == 0 && (nd->nd_flag & (ND_NFSV4 | ND_DSSERVER)) == ND_NFSV4 && gotproxystateid == 0) nd->nd_repstat = nfsrv_lockctrl(vp, &stp, &lop, NULL, clientid, &stateid, exp, nd, p); if (nd->nd_repstat) { vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_wcc(nd, forat_ret, &forat, aftat_ret, &nva); goto out; } /* * For NFS Version 2, it is not obvious what a write of zero length * should do, but I might as well be consistent with Version 3, * which is to return ok so long as there are no permission problems. */ if (retlen > 0) { nd->nd_repstat = nfsvno_write(vp, off, retlen, &stable, nd->nd_md, nd->nd_dpos, nd->nd_cred, p); error = nfsm_advance(nd, NFSM_RNDUP(retlen), -1); if (error) goto nfsmout; } if (nd->nd_flag & ND_NFSV4) aftat_ret = 0; else aftat_ret = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); vput(vp); if (!nd->nd_repstat) nd->nd_repstat = aftat_ret; if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { if (nd->nd_flag & ND_NFSV3) nfsrv_wcc(nd, forat_ret, &forat, aftat_ret, &nva); if (nd->nd_repstat) goto out; NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(retlen); /* * If nfs_async is set, then pretend the write was FILESYNC. * Warning: Doing this violates RFC1813 and runs a risk * of data written by a client being lost when the server * crashes/reboots. */ if (stable == NFSWRITE_UNSTABLE && nfs_async == 0) *tl++ = txdr_unsigned(stable); else *tl++ = txdr_unsigned(NFSWRITE_FILESYNC); /* * Actually, there is no need to txdr these fields, * but it may make the values more human readable, * for debugging purposes. */ *tl++ = txdr_unsigned(nfsboottime.tv_sec); *tl = txdr_unsigned(nfsboottime.tv_usec); } else if (!nd->nd_repstat) nfsrv_fillattr(nd, &nva); out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfs create service (creates regular files for V2 and V3. Spec. files for V2.) * now does a truncate to 0 length via. setattr if it already exists * The core creation routine has been extracted out into nfsrv_creatsub(), * so it can also be used by nfsrv_open() for V4. */ int nfsrvd_create(struct nfsrv_descript *nd, __unused int isdgram, vnode_t dp, struct nfsexstuff *exp) { struct nfsvattr nva, dirfor, diraft; struct nfsv2_sattr *sp; struct nameidata named; u_int32_t *tl; int error = 0, tsize, dirfor_ret = 1, diraft_ret = 1; int how = NFSCREATE_UNCHECKED, exclusive_flag = 0; NFSDEV_T rdev = 0; vnode_t vp = NULL, dirp = NULL; fhandle_t fh; char *bufp; u_long *hashp; enum vtype vtyp; int32_t cverf[2], tverf[2] = { 0, 0 }; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, CREATE, LOCKPARENT | LOCKLEAF | SAVESTART | NOCACHE); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error) goto nfsmout; if (!nd->nd_repstat) { NFSVNO_ATTRINIT(&nva); if (nd->nd_flag & ND_NFSV2) { NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR); vtyp = IFTOVT(fxdr_unsigned(u_int32_t, sp->sa_mode)); if (vtyp == VNON) vtyp = VREG; NFSVNO_SETATTRVAL(&nva, type, vtyp); NFSVNO_SETATTRVAL(&nva, mode, nfstov_mode(sp->sa_mode)); switch (nva.na_type) { case VREG: tsize = fxdr_unsigned(int32_t, sp->sa_size); if (tsize != -1) NFSVNO_SETATTRVAL(&nva, size, (u_quad_t)tsize); break; case VCHR: case VBLK: case VFIFO: rdev = fxdr_unsigned(NFSDEV_T, sp->sa_size); break; default: break; } } else { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); how = fxdr_unsigned(int, *tl); switch (how) { case NFSCREATE_GUARDED: case NFSCREATE_UNCHECKED: error = nfsrv_sattr(nd, NULL, &nva, NULL, NULL, p); if (error) goto nfsmout; break; case NFSCREATE_EXCLUSIVE: NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF); cverf[0] = *tl++; cverf[1] = *tl; exclusive_flag = 1; break; } NFSVNO_SETATTRVAL(&nva, type, VREG); } } if (nd->nd_repstat) { nfsvno_relpathbuf(&named); if (nd->nd_flag & ND_NFSV3) { dirfor_ret = nfsvno_getattr(dp, &dirfor, nd, p, 1, NULL); nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); } vput(dp); goto out; } nd->nd_repstat = nfsvno_namei(nd, &named, dp, 1, exp, &dirp); if (dirp) { if (nd->nd_flag & ND_NFSV2) { vrele(dirp); dirp = NULL; } else { dirfor_ret = nfsvno_getattr(dirp, &dirfor, nd, p, 0, NULL); } } if (nd->nd_repstat) { if (nd->nd_flag & ND_NFSV3) nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); if (dirp) vrele(dirp); goto out; } if (!(nd->nd_flag & ND_NFSV2)) { switch (how) { case NFSCREATE_GUARDED: if (named.ni_vp) nd->nd_repstat = EEXIST; break; case NFSCREATE_UNCHECKED: break; case NFSCREATE_EXCLUSIVE: if (named.ni_vp == NULL) NFSVNO_SETATTRVAL(&nva, mode, 0); break; } } /* * Iff doesn't exist, create it * otherwise just truncate to 0 length * should I set the mode too ? */ nd->nd_repstat = nfsvno_createsub(nd, &named, &vp, &nva, &exclusive_flag, cverf, rdev, exp); if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_getfh(vp, &fh, p); if (!nd->nd_repstat) nd->nd_repstat = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); vput(vp); if (!nd->nd_repstat) { tverf[0] = nva.na_atime.tv_sec; tverf[1] = nva.na_atime.tv_nsec; } } if (nd->nd_flag & ND_NFSV2) { if (!nd->nd_repstat) { (void) nfsm_fhtom(nd, (u_int8_t *)&fh, 0, 0); nfsrv_fillattr(nd, &nva); } } else { if (exclusive_flag && !nd->nd_repstat && (cverf[0] != tverf[0] || cverf[1] != tverf[1])) nd->nd_repstat = EEXIST; diraft_ret = nfsvno_getattr(dirp, &diraft, nd, p, 0, NULL); vrele(dirp); if (!nd->nd_repstat) { (void) nfsm_fhtom(nd, (u_int8_t *)&fh, 0, 1); nfsrv_postopattr(nd, 0, &nva); } nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); } out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(dp); nfsvno_relpathbuf(&named); NFSEXITCODE2(error, nd); return (error); } /* * nfs v3 mknod service (and v4 create) */ int nfsrvd_mknod(struct nfsrv_descript *nd, __unused int isdgram, vnode_t dp, vnode_t *vpp, fhandle_t *fhp, struct nfsexstuff *exp) { struct nfsvattr nva, dirfor, diraft; u_int32_t *tl; struct nameidata named; int error = 0, dirfor_ret = 1, diraft_ret = 1, pathlen; u_int32_t major, minor; enum vtype vtyp = VNON; nfstype nfs4type = NFNON; vnode_t vp, dirp = NULL; nfsattrbit_t attrbits; char *bufp = NULL, *pathcp = NULL; u_long *hashp, cnflags; NFSACL_T *aclp = NULL; struct thread *p = curthread; NFSVNO_ATTRINIT(&nva); cnflags = (LOCKPARENT | SAVESTART); if (nd->nd_repstat) { nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } #ifdef NFS4_ACL_EXTATTR_NAME aclp = acl_alloc(M_WAITOK); aclp->acl_cnt = 0; #endif /* * For V4, the creation stuff is here, Yuck! */ if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); vtyp = nfsv34tov_type(*tl); nfs4type = fxdr_unsigned(nfstype, *tl); switch (nfs4type) { case NFLNK: error = nfsvno_getsymlink(nd, &nva, p, &pathcp, &pathlen); if (error) goto nfsmout; break; case NFCHR: case NFBLK: NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); major = fxdr_unsigned(u_int32_t, *tl++); minor = fxdr_unsigned(u_int32_t, *tl); nva.na_rdev = NFSMAKEDEV(major, minor); break; case NFSOCK: case NFFIFO: break; case NFDIR: - cnflags = (LOCKPARENT | SAVENAME); + cnflags = LOCKPARENT; break; default: nd->nd_repstat = NFSERR_BADTYPE; vrele(dp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif goto out; } } NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, CREATE, cnflags | NOCACHE); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error) goto nfsmout; if (!nd->nd_repstat) { if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); vtyp = nfsv34tov_type(*tl); } error = nfsrv_sattr(nd, NULL, &nva, &attrbits, aclp, p); if (error) goto nfsmout; nva.na_type = vtyp; if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV3) && (vtyp == VCHR || vtyp == VBLK)) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); major = fxdr_unsigned(u_int32_t, *tl++); minor = fxdr_unsigned(u_int32_t, *tl); nva.na_rdev = NFSMAKEDEV(major, minor); } } dirfor_ret = nfsvno_getattr(dp, &dirfor, nd, p, 0, NULL); if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV4)) { if (!dirfor_ret && NFSVNO_ISSETGID(&nva) && dirfor.na_gid == nva.na_gid) NFSVNO_UNSET(&nva, gid); nd->nd_repstat = nfsrv_checkuidgid(nd, &nva); } if (nd->nd_repstat) { vrele(dp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif nfsvno_relpathbuf(&named); if (pathcp) free(pathcp, M_TEMP); if (nd->nd_flag & ND_NFSV3) nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } /* * Yuck! For V4, mkdir and link are here and some V4 clients don't fill * in va_mode, so we'll have to set a default here. */ if (NFSVNO_NOTSETMODE(&nva)) { if (vtyp == VLNK) nva.na_mode = 0755; else nva.na_mode = 0400; } if (vtyp == VDIR) named.ni_cnd.cn_flags |= WILLBEDIR; nd->nd_repstat = nfsvno_namei(nd, &named, dp, 0, exp, &dirp); if (nd->nd_repstat) { if (dirp) { if (nd->nd_flag & ND_NFSV3) dirfor_ret = nfsvno_getattr(dirp, &dirfor, nd, p, 0, NULL); vrele(dirp); } #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif if (nd->nd_flag & ND_NFSV3) nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } if (dirp) dirfor_ret = nfsvno_getattr(dirp, &dirfor, nd, p, 0, NULL); if ((nd->nd_flag & ND_NFSV4) && (vtyp == VDIR || vtyp == VLNK)) { if (vtyp == VDIR) { nfsrvd_mkdirsub(nd, &named, &nva, fhp, vpp, dirp, &dirfor, &diraft, &diraft_ret, &attrbits, aclp, p, exp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif goto out; } else if (vtyp == VLNK) { nfsrvd_symlinksub(nd, &named, &nva, fhp, vpp, dirp, &dirfor, &diraft, &diraft_ret, &attrbits, aclp, p, exp, pathcp, pathlen); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif free(pathcp, M_TEMP); goto out; } } nd->nd_repstat = nfsvno_mknod(&named, &nva, nd->nd_cred, p); if (!nd->nd_repstat) { vp = named.ni_vp; nfsrv_fixattr(nd, vp, &nva, aclp, p, &attrbits, exp); nd->nd_repstat = nfsvno_getfh(vp, fhp, p); if ((nd->nd_flag & ND_NFSV3) && !nd->nd_repstat) nd->nd_repstat = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); if (vpp != NULL && nd->nd_repstat == 0) { NFSVOPUNLOCK(vp); *vpp = vp; } else vput(vp); } diraft_ret = nfsvno_getattr(dirp, &diraft, nd, p, 0, NULL); vrele(dirp); if (!nd->nd_repstat) { if (nd->nd_flag & ND_NFSV3) { (void) nfsm_fhtom(nd, (u_int8_t *)fhp, 0, 1); nfsrv_postopattr(nd, 0, &nva); } else { NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = newnfs_false; txdr_hyper(dirfor.na_filerev, tl); tl += 2; txdr_hyper(diraft.na_filerev, tl); (void) nfsrv_putattrbit(nd, &attrbits); } } if (nd->nd_flag & ND_NFSV3) nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif out: NFSEXITCODE2(0, nd); return (0); nfsmout: vrele(dp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif if (bufp) nfsvno_relpathbuf(&named); if (pathcp) free(pathcp, M_TEMP); NFSEXITCODE2(error, nd); return (error); } /* * nfs remove service */ int nfsrvd_remove(struct nfsrv_descript *nd, __unused int isdgram, vnode_t dp, struct nfsexstuff *exp) { struct nameidata named; u_int32_t *tl; int error = 0, dirfor_ret = 1, diraft_ret = 1; vnode_t dirp = NULL; struct nfsvattr dirfor, diraft; char *bufp; u_long *hashp; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, DELETE, LOCKPARENT | LOCKLEAF); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error) { vput(dp); nfsvno_relpathbuf(&named); goto out; } if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_namei(nd, &named, dp, 1, exp, &dirp); } else { vput(dp); nfsvno_relpathbuf(&named); } if (dirp) { if (!(nd->nd_flag & ND_NFSV2)) { dirfor_ret = nfsvno_getattr(dirp, &dirfor, nd, p, 0, NULL); } else { vrele(dirp); dirp = NULL; } } if (!nd->nd_repstat) { if (nd->nd_flag & ND_NFSV4) { if (named.ni_vp->v_type == VDIR) nd->nd_repstat = nfsvno_rmdirsub(&named, 1, nd->nd_cred, p, exp); else nd->nd_repstat = nfsvno_removesub(&named, 1, nd->nd_cred, p, exp); } else if (nd->nd_procnum == NFSPROC_RMDIR) { nd->nd_repstat = nfsvno_rmdirsub(&named, 0, nd->nd_cred, p, exp); } else { nd->nd_repstat = nfsvno_removesub(&named, 0, nd->nd_cred, p, exp); } } if (!(nd->nd_flag & ND_NFSV2)) { if (dirp) { diraft_ret = nfsvno_getattr(dirp, &diraft, nd, p, 0, NULL); vrele(dirp); } if (nd->nd_flag & ND_NFSV3) { nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); } else if (!nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = newnfs_false; txdr_hyper(dirfor.na_filerev, tl); tl += 2; txdr_hyper(diraft.na_filerev, tl); } } out: NFSEXITCODE2(error, nd); return (error); } /* * nfs rename service */ int nfsrvd_rename(struct nfsrv_descript *nd, int isdgram, vnode_t dp, vnode_t todp, struct nfsexstuff *exp, struct nfsexstuff *toexp) { u_int32_t *tl; int error = 0, fdirfor_ret = 1, fdiraft_ret = 1; int tdirfor_ret = 1, tdiraft_ret = 1; struct nameidata fromnd, tond; vnode_t fdirp = NULL, tdirp = NULL, tdp = NULL; struct nfsvattr fdirfor, fdiraft, tdirfor, tdiraft; struct nfsexstuff tnes; struct nfsrvfh tfh; char *bufp, *tbufp = NULL; u_long *hashp; fhandle_t fh; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_wcc(nd, fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); nfsrv_wcc(nd, tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft); goto out; } if (!(nd->nd_flag & ND_NFSV2)) fdirfor_ret = nfsvno_getattr(dp, &fdirfor, nd, p, 1, NULL); tond.ni_cnd.cn_nameiop = 0; tond.ni_startdir = NULL; NFSNAMEICNDSET(&fromnd.ni_cnd, nd->nd_cred, DELETE, WANTPARENT | SAVESTART); nfsvno_setpathbuf(&fromnd, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &fromnd.ni_pathlen); if (error) { vput(dp); if (todp) vrele(todp); nfsvno_relpathbuf(&fromnd); goto out; } /* * Unlock dp in this code section, so it is unlocked before * tdp gets locked. This avoids a potential LOR if tdp is the * parent directory of dp. */ if (nd->nd_flag & ND_NFSV4) { tdp = todp; tnes = *toexp; if (dp != tdp) { NFSVOPUNLOCK(dp); /* Might lock tdp. */ tdirfor_ret = nfsvno_getattr(tdp, &tdirfor, nd, p, 0, NULL); } else { tdirfor_ret = nfsvno_getattr(tdp, &tdirfor, nd, p, 1, NULL); NFSVOPUNLOCK(dp); } } else { tfh.nfsrvfh_len = 0; error = nfsrv_mtofh(nd, &tfh); if (error == 0) error = nfsvno_getfh(dp, &fh, p); if (error) { vput(dp); /* todp is always NULL except NFSv4 */ nfsvno_relpathbuf(&fromnd); goto out; } /* If this is the same file handle, just VREF() the vnode. */ if (tfh.nfsrvfh_len == NFSX_MYFH && !NFSBCMP(tfh.nfsrvfh_data, &fh, NFSX_MYFH)) { VREF(dp); tdp = dp; tnes = *exp; tdirfor_ret = nfsvno_getattr(tdp, &tdirfor, nd, p, 1, NULL); NFSVOPUNLOCK(dp); } else { NFSVOPUNLOCK(dp); nd->nd_cred->cr_uid = nd->nd_saveduid; nfsd_fhtovp(nd, &tfh, LK_EXCLUSIVE, &tdp, &tnes, NULL, 0, -1); /* Locks tdp. */ if (tdp) { tdirfor_ret = nfsvno_getattr(tdp, &tdirfor, nd, p, 1, NULL); NFSVOPUNLOCK(tdp); } } } NFSNAMEICNDSET(&tond.ni_cnd, nd->nd_cred, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART); nfsvno_setpathbuf(&tond, &tbufp, &hashp); if (!nd->nd_repstat) { error = nfsrv_parsename(nd, tbufp, hashp, &tond.ni_pathlen); if (error) { if (tdp) vrele(tdp); vrele(dp); nfsvno_relpathbuf(&fromnd); nfsvno_relpathbuf(&tond); goto out; } } if (nd->nd_repstat) { if (nd->nd_flag & ND_NFSV3) { nfsrv_wcc(nd, fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); nfsrv_wcc(nd, tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft); } if (tdp) vrele(tdp); vrele(dp); nfsvno_relpathbuf(&fromnd); nfsvno_relpathbuf(&tond); goto out; } /* * Done parsing, now down to business. */ nd->nd_repstat = nfsvno_namei(nd, &fromnd, dp, 0, exp, &fdirp); if (nd->nd_repstat) { if (nd->nd_flag & ND_NFSV3) { nfsrv_wcc(nd, fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); nfsrv_wcc(nd, tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft); } if (fdirp) vrele(fdirp); if (tdp) vrele(tdp); nfsvno_relpathbuf(&tond); goto out; } if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; nd->nd_repstat = nfsvno_namei(nd, &tond, tdp, 0, &tnes, &tdirp); nd->nd_repstat = nfsvno_rename(&fromnd, &tond, nd->nd_repstat, nd->nd_flag, nd->nd_cred, p); if (fdirp) fdiraft_ret = nfsvno_getattr(fdirp, &fdiraft, nd, p, 0, NULL); if (tdirp) tdiraft_ret = nfsvno_getattr(tdirp, &tdiraft, nd, p, 0, NULL); if (fdirp) vrele(fdirp); if (tdirp) vrele(tdirp); if (nd->nd_flag & ND_NFSV3) { nfsrv_wcc(nd, fdirfor_ret, &fdirfor, fdiraft_ret, &fdiraft); nfsrv_wcc(nd, tdirfor_ret, &tdirfor, tdiraft_ret, &tdiraft); } else if ((nd->nd_flag & ND_NFSV4) && !nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, 10 * NFSX_UNSIGNED); *tl++ = newnfs_false; txdr_hyper(fdirfor.na_filerev, tl); tl += 2; txdr_hyper(fdiraft.na_filerev, tl); tl += 2; *tl++ = newnfs_false; txdr_hyper(tdirfor.na_filerev, tl); tl += 2; txdr_hyper(tdiraft.na_filerev, tl); } out: NFSEXITCODE2(error, nd); return (error); } /* * nfs link service */ int nfsrvd_link(struct nfsrv_descript *nd, int isdgram, vnode_t vp, vnode_t tovp, struct nfsexstuff *exp, struct nfsexstuff *toexp) { struct nameidata named; u_int32_t *tl; int error = 0, dirfor_ret = 1, diraft_ret = 1, getret = 1; vnode_t dirp = NULL, dp = NULL; struct nfsvattr dirfor, diraft, at; struct nfsexstuff tnes; struct nfsrvfh dfh; char *bufp; u_long *hashp; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } NFSVOPUNLOCK(vp); if (vp->v_type == VDIR) { if (nd->nd_flag & ND_NFSV4) nd->nd_repstat = NFSERR_ISDIR; else nd->nd_repstat = NFSERR_INVAL; if (tovp) vrele(tovp); } if (!nd->nd_repstat) { if (nd->nd_flag & ND_NFSV4) { dp = tovp; tnes = *toexp; } else { error = nfsrv_mtofh(nd, &dfh); if (error) { vrele(vp); /* tovp is always NULL unless NFSv4 */ goto out; } nfsd_fhtovp(nd, &dfh, LK_EXCLUSIVE, &dp, &tnes, NULL, 0, -1); if (dp) NFSVOPUNLOCK(dp); } } - NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, CREATE, - LOCKPARENT | SAVENAME | NOCACHE); + NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, CREATE, LOCKPARENT | NOCACHE); if (!nd->nd_repstat) { nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error) { vrele(vp); if (dp) vrele(dp); nfsvno_relpathbuf(&named); goto out; } if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_namei(nd, &named, dp, 0, &tnes, &dirp); } else { if (dp) vrele(dp); nfsvno_relpathbuf(&named); } } if (dirp) { if (nd->nd_flag & ND_NFSV2) { vrele(dirp); dirp = NULL; } else { dirfor_ret = nfsvno_getattr(dirp, &dirfor, nd, p, 0, NULL); } } if (!nd->nd_repstat) nd->nd_repstat = nfsvno_link(&named, vp, nd->nd_cred, p, exp); if (nd->nd_flag & ND_NFSV3) getret = nfsvno_getattr(vp, &at, nd, p, 0, NULL); if (dirp) { diraft_ret = nfsvno_getattr(dirp, &diraft, nd, p, 0, NULL); vrele(dirp); } vrele(vp); if (nd->nd_flag & ND_NFSV3) { nfsrv_postopattr(nd, getret, &at); nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); } else if ((nd->nd_flag & ND_NFSV4) && !nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = newnfs_false; txdr_hyper(dirfor.na_filerev, tl); tl += 2; txdr_hyper(diraft.na_filerev, tl); } out: NFSEXITCODE2(error, nd); return (error); } /* * nfs symbolic link service */ int nfsrvd_symlink(struct nfsrv_descript *nd, __unused int isdgram, vnode_t dp, vnode_t *vpp, fhandle_t *fhp, struct nfsexstuff *exp) { struct nfsvattr nva, dirfor, diraft; struct nameidata named; int error = 0, dirfor_ret = 1, diraft_ret = 1, pathlen; vnode_t dirp = NULL; char *bufp, *pathcp = NULL; u_long *hashp; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } if (vpp) *vpp = NULL; NFSVNO_ATTRINIT(&nva); NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, CREATE, LOCKPARENT | SAVESTART | NOCACHE); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (!error && !nd->nd_repstat) error = nfsvno_getsymlink(nd, &nva, p, &pathcp, &pathlen); if (error) { vrele(dp); nfsvno_relpathbuf(&named); goto out; } if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_namei(nd, &named, dp, 0, exp, &dirp); } else { vrele(dp); nfsvno_relpathbuf(&named); } if (dirp != NULL && !(nd->nd_flag & ND_NFSV3)) { vrele(dirp); dirp = NULL; } /* * And call nfsrvd_symlinksub() to do the common code. It will * return EBADRPC upon a parsing error, 0 otherwise. */ if (!nd->nd_repstat) { if (dirp != NULL) dirfor_ret = nfsvno_getattr(dirp, &dirfor, nd, p, 0, NULL); nfsrvd_symlinksub(nd, &named, &nva, fhp, vpp, dirp, &dirfor, &diraft, &diraft_ret, NULL, NULL, p, exp, pathcp, pathlen); } else if (dirp != NULL) { dirfor_ret = nfsvno_getattr(dirp, &dirfor, nd, p, 0, NULL); vrele(dirp); } if (pathcp) free(pathcp, M_TEMP); if (nd->nd_flag & ND_NFSV3) { if (!nd->nd_repstat) { (void) nfsm_fhtom(nd, (u_int8_t *)fhp, 0, 1); nfsrv_postopattr(nd, 0, &nva); } nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); } out: NFSEXITCODE2(error, nd); return (error); } /* * Common code for creating a symbolic link. */ static void nfsrvd_symlinksub(struct nfsrv_descript *nd, struct nameidata *ndp, struct nfsvattr *nvap, fhandle_t *fhp, vnode_t *vpp, vnode_t dirp, struct nfsvattr *dirforp, struct nfsvattr *diraftp, int *diraft_retp, nfsattrbit_t *attrbitp, NFSACL_T *aclp, NFSPROC_T *p, struct nfsexstuff *exp, char *pathcp, int pathlen) { u_int32_t *tl; nd->nd_repstat = nfsvno_symlink(ndp, nvap, pathcp, pathlen, !(nd->nd_flag & ND_NFSV2), nd->nd_saveduid, nd->nd_cred, p, exp); if (!nd->nd_repstat && !(nd->nd_flag & ND_NFSV2)) { nfsrv_fixattr(nd, ndp->ni_vp, nvap, aclp, p, attrbitp, exp); if (nd->nd_flag & ND_NFSV3) { nd->nd_repstat = nfsvno_getfh(ndp->ni_vp, fhp, p); if (!nd->nd_repstat) nd->nd_repstat = nfsvno_getattr(ndp->ni_vp, nvap, nd, p, 1, NULL); } if (vpp != NULL && nd->nd_repstat == 0) { NFSVOPUNLOCK(ndp->ni_vp); *vpp = ndp->ni_vp; } else vput(ndp->ni_vp); } if (dirp) { *diraft_retp = nfsvno_getattr(dirp, diraftp, nd, p, 0, NULL); vrele(dirp); } if ((nd->nd_flag & ND_NFSV4) && !nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = newnfs_false; txdr_hyper(dirforp->na_filerev, tl); tl += 2; txdr_hyper(diraftp->na_filerev, tl); (void) nfsrv_putattrbit(nd, attrbitp); } NFSEXITCODE2(0, nd); } /* * nfs mkdir service */ int nfsrvd_mkdir(struct nfsrv_descript *nd, __unused int isdgram, vnode_t dp, vnode_t *vpp, fhandle_t *fhp, struct nfsexstuff *exp) { struct nfsvattr nva, dirfor, diraft; struct nameidata named; u_int32_t *tl; int error = 0, dirfor_ret = 1, diraft_ret = 1; vnode_t dirp = NULL; char *bufp; u_long *hashp; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } - NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, CREATE, - LOCKPARENT | SAVENAME | NOCACHE); + NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, CREATE, LOCKPARENT | NOCACHE); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error) goto nfsmout; if (!nd->nd_repstat) { NFSVNO_ATTRINIT(&nva); if (nd->nd_flag & ND_NFSV3) { error = nfsrv_sattr(nd, NULL, &nva, NULL, NULL, p); if (error) goto nfsmout; } else { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nva.na_mode = nfstov_mode(*tl++); } } if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_namei(nd, &named, dp, 0, exp, &dirp); } else { vrele(dp); nfsvno_relpathbuf(&named); } if (dirp != NULL && !(nd->nd_flag & ND_NFSV3)) { vrele(dirp); dirp = NULL; } if (nd->nd_repstat) { if (dirp != NULL) { dirfor_ret = nfsvno_getattr(dirp, &dirfor, nd, p, 0, NULL); vrele(dirp); } if (nd->nd_flag & ND_NFSV3) nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); goto out; } if (dirp != NULL) dirfor_ret = nfsvno_getattr(dirp, &dirfor, nd, p, 0, NULL); /* * Call nfsrvd_mkdirsub() for the code common to V4 as well. */ nfsrvd_mkdirsub(nd, &named, &nva, fhp, vpp, dirp, &dirfor, &diraft, &diraft_ret, NULL, NULL, p, exp); if (nd->nd_flag & ND_NFSV3) { if (!nd->nd_repstat) { (void) nfsm_fhtom(nd, (u_int8_t *)fhp, 0, 1); nfsrv_postopattr(nd, 0, &nva); } nfsrv_wcc(nd, dirfor_ret, &dirfor, diraft_ret, &diraft); } else if (!nd->nd_repstat) { (void) nfsm_fhtom(nd, (u_int8_t *)fhp, 0, 0); nfsrv_fillattr(nd, &nva); } out: NFSEXITCODE2(0, nd); return (0); nfsmout: vrele(dp); nfsvno_relpathbuf(&named); NFSEXITCODE2(error, nd); return (error); } /* * Code common to mkdir for V2,3 and 4. */ static void nfsrvd_mkdirsub(struct nfsrv_descript *nd, struct nameidata *ndp, struct nfsvattr *nvap, fhandle_t *fhp, vnode_t *vpp, vnode_t dirp, struct nfsvattr *dirforp, struct nfsvattr *diraftp, int *diraft_retp, nfsattrbit_t *attrbitp, NFSACL_T *aclp, NFSPROC_T *p, struct nfsexstuff *exp) { vnode_t vp; u_int32_t *tl; NFSVNO_SETATTRVAL(nvap, type, VDIR); nd->nd_repstat = nfsvno_mkdir(ndp, nvap, nd->nd_saveduid, nd->nd_cred, p, exp); if (!nd->nd_repstat) { vp = ndp->ni_vp; nfsrv_fixattr(nd, vp, nvap, aclp, p, attrbitp, exp); nd->nd_repstat = nfsvno_getfh(vp, fhp, p); if (!(nd->nd_flag & ND_NFSV4) && !nd->nd_repstat) nd->nd_repstat = nfsvno_getattr(vp, nvap, nd, p, 1, NULL); if (vpp && !nd->nd_repstat) { NFSVOPUNLOCK(vp); *vpp = vp; } else { vput(vp); } } if (dirp) { *diraft_retp = nfsvno_getattr(dirp, diraftp, nd, p, 0, NULL); vrele(dirp); } if ((nd->nd_flag & ND_NFSV4) && !nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = newnfs_false; txdr_hyper(dirforp->na_filerev, tl); tl += 2; txdr_hyper(diraftp->na_filerev, tl); (void) nfsrv_putattrbit(nd, attrbitp); } NFSEXITCODE2(0, nd); } /* * nfs commit service */ int nfsrvd_commit(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { struct nfsvattr bfor, aft; u_int32_t *tl; int error = 0, for_ret = 1, aft_ret = 1, cnt; u_int64_t off; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_wcc(nd, for_ret, &bfor, aft_ret, &aft); goto out; } /* Return NFSERR_ISDIR in NFSv4 when commit on a directory. */ if (vp->v_type != VREG) { if (nd->nd_flag & ND_NFSV3) error = NFSERR_NOTSUPP; else error = (vp->v_type == VDIR) ? NFSERR_ISDIR : NFSERR_INVAL; goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); /* * XXX At this time VOP_FSYNC() does not accept offset and byte * count parameters, so these arguments are useless (someday maybe). */ off = fxdr_hyper(tl); tl += 2; cnt = fxdr_unsigned(int, *tl); if (nd->nd_flag & ND_NFSV3) for_ret = nfsvno_getattr(vp, &bfor, nd, p, 1, NULL); nd->nd_repstat = nfsvno_fsync(vp, off, cnt, nd->nd_cred, p); if (nd->nd_flag & ND_NFSV3) { aft_ret = nfsvno_getattr(vp, &aft, nd, p, 1, NULL); nfsrv_wcc(nd, for_ret, &bfor, aft_ret, &aft); } vput(vp); if (!nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = txdr_unsigned(nfsboottime.tv_sec); *tl = txdr_unsigned(nfsboottime.tv_usec); } out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfs statfs service */ int nfsrvd_statfs(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { struct statfs *sf; u_int32_t *tl; int getret = 1; struct nfsvattr at; u_quad_t tval; struct thread *p = curthread; sf = NULL; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); goto out; } sf = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); nd->nd_repstat = nfsvno_statfs(vp, sf); getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL); vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); if (nd->nd_repstat) goto out; if (nd->nd_flag & ND_NFSV2) { NFSM_BUILD(tl, u_int32_t *, NFSX_V2STATFS); *tl++ = txdr_unsigned(NFS_V2MAXDATA); *tl++ = txdr_unsigned(sf->f_bsize); *tl++ = txdr_unsigned(sf->f_blocks); *tl++ = txdr_unsigned(sf->f_bfree); *tl = txdr_unsigned(sf->f_bavail); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_V3STATFS); tval = (u_quad_t)sf->f_blocks; tval *= (u_quad_t)sf->f_bsize; txdr_hyper(tval, tl); tl += 2; tval = (u_quad_t)sf->f_bfree; tval *= (u_quad_t)sf->f_bsize; txdr_hyper(tval, tl); tl += 2; tval = (u_quad_t)sf->f_bavail; tval *= (u_quad_t)sf->f_bsize; txdr_hyper(tval, tl); tl += 2; tval = (u_quad_t)sf->f_files; txdr_hyper(tval, tl); tl += 2; tval = (u_quad_t)sf->f_ffree; txdr_hyper(tval, tl); tl += 2; tval = (u_quad_t)sf->f_ffree; txdr_hyper(tval, tl); tl += 2; *tl = 0; } out: free(sf, M_STATFS); NFSEXITCODE2(0, nd); return (0); } /* * nfs fsinfo service */ int nfsrvd_fsinfo(struct nfsrv_descript *nd, int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; struct nfsfsinfo fs; int getret = 1; struct nfsvattr at; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); goto out; } getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL); nfsvno_getfs(&fs, isdgram); vput(vp); nfsrv_postopattr(nd, getret, &at); NFSM_BUILD(tl, u_int32_t *, NFSX_V3FSINFO); *tl++ = txdr_unsigned(fs.fs_rtmax); *tl++ = txdr_unsigned(fs.fs_rtpref); *tl++ = txdr_unsigned(fs.fs_rtmult); *tl++ = txdr_unsigned(fs.fs_wtmax); *tl++ = txdr_unsigned(fs.fs_wtpref); *tl++ = txdr_unsigned(fs.fs_wtmult); *tl++ = txdr_unsigned(fs.fs_dtpref); txdr_hyper(fs.fs_maxfilesize, tl); tl += 2; txdr_nfsv3time(&fs.fs_timedelta, tl); tl += 2; *tl = txdr_unsigned(fs.fs_properties); out: NFSEXITCODE2(0, nd); return (0); } /* * nfs pathconf service */ int nfsrvd_pathconf(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { struct nfsv3_pathconf *pc; int getret = 1; long linkmax, namemax, chownres, notrunc; struct nfsvattr at; struct thread *p = curthread; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); goto out; } nd->nd_repstat = nfsvno_pathconf(vp, _PC_LINK_MAX, &linkmax, nd->nd_cred, p); if (!nd->nd_repstat) nd->nd_repstat = nfsvno_pathconf(vp, _PC_NAME_MAX, &namemax, nd->nd_cred, p); if (!nd->nd_repstat) nd->nd_repstat=nfsvno_pathconf(vp, _PC_CHOWN_RESTRICTED, &chownres, nd->nd_cred, p); if (!nd->nd_repstat) nd->nd_repstat = nfsvno_pathconf(vp, _PC_NO_TRUNC, ¬runc, nd->nd_cred, p); getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL); vput(vp); nfsrv_postopattr(nd, getret, &at); if (!nd->nd_repstat) { NFSM_BUILD(pc, struct nfsv3_pathconf *, NFSX_V3PATHCONF); pc->pc_linkmax = txdr_unsigned(linkmax); pc->pc_namemax = txdr_unsigned(namemax); pc->pc_notrunc = txdr_unsigned(notrunc); pc->pc_chownrestricted = txdr_unsigned(chownres); /* * These should probably be supported by VOP_PATHCONF(), but * until msdosfs is exportable (why would you want to?), the * Unix defaults should be ok. */ pc->pc_caseinsensitive = newnfs_false; pc->pc_casepreserving = newnfs_true; } out: NFSEXITCODE2(0, nd); return (0); } /* * nfsv4 lock service */ int nfsrvd_lock(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { u_int32_t *tl; int i; struct nfsstate *stp = NULL; struct nfslock *lop; struct nfslockconflict cf; int error = 0; u_short flags = NFSLCK_LOCK, lflags; u_int64_t offset, len; nfsv4stateid_t stateid; nfsquad_t clientid; struct thread *p = curthread; NFSM_DISSECT(tl, u_int32_t *, 7 * NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl++); switch (i) { case NFSV4LOCKT_READW: flags |= NFSLCK_BLOCKING; case NFSV4LOCKT_READ: lflags = NFSLCK_READ; break; case NFSV4LOCKT_WRITEW: flags |= NFSLCK_BLOCKING; case NFSV4LOCKT_WRITE: lflags = NFSLCK_WRITE; break; default: nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } if (*tl++ == newnfs_true) flags |= NFSLCK_RECLAIM; offset = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); tl += 2; if (*tl == newnfs_true) flags |= NFSLCK_OPENTOLOCK; if (flags & NFSLCK_OPENTOLOCK) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED + NFSX_STATEID); i = fxdr_unsigned(int, *(tl+4+(NFSX_STATEID / NFSX_UNSIGNED))); if (i <= 0 || i > NFSV4_OPAQUELIMIT) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } stp = malloc(sizeof (struct nfsstate) + i, M_NFSDSTATE, M_WAITOK); stp->ls_ownerlen = i; stp->ls_op = nd->nd_rp; stp->ls_seq = fxdr_unsigned(int, *tl++); stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); NFSBCOPY((caddr_t)tl, (caddr_t)stp->ls_stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); /* * For the special stateid of other all 0s and seqid == 1, set * the stateid to the current stateid, if it is set. */ if ((nd->nd_flag & ND_NFSV41) != 0 && stp->ls_stateid.seqid == 1 && stp->ls_stateid.other[0] == 0 && stp->ls_stateid.other[1] == 0 && stp->ls_stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stp->ls_stateid = nd->nd_curstateid; stp->ls_stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } stp->ls_opentolockseq = fxdr_unsigned(int, *tl++); clientid.lval[0] = *tl++; clientid.lval[1] = *tl++; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK3 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } error = nfsrv_mtostr(nd, stp->ls_owner, stp->ls_ownerlen); if (error) goto nfsmout; } else { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); stp = malloc(sizeof (struct nfsstate), M_NFSDSTATE, M_WAITOK); stp->ls_ownerlen = 0; stp->ls_op = nd->nd_rp; stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); NFSBCOPY((caddr_t)tl, (caddr_t)stp->ls_stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); /* * For the special stateid of other all 0s and seqid == 1, set * the stateid to the current stateid, if it is set. */ if ((nd->nd_flag & ND_NFSV41) != 0 && stp->ls_stateid.seqid == 1 && stp->ls_stateid.other[0] == 0 && stp->ls_stateid.other[1] == 0 && stp->ls_stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stp->ls_stateid = nd->nd_curstateid; stp->ls_stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } stp->ls_seq = fxdr_unsigned(int, *tl); clientid.lval[0] = stp->ls_stateid.other[0]; clientid.lval[1] = stp->ls_stateid.other[1]; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK4 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } } lop = malloc(sizeof (struct nfslock), M_NFSDLOCK, M_WAITOK); lop->lo_first = offset; if (len == NFS64BITSSET) { lop->lo_end = NFS64BITSSET; } else { lop->lo_end = offset + len; if (lop->lo_end <= lop->lo_first) nd->nd_repstat = NFSERR_INVAL; } lop->lo_flags = lflags; stp->ls_flags = flags; stp->ls_uid = nd->nd_cred->cr_uid; /* * Do basic access checking. */ if (!nd->nd_repstat && vp->v_type != VREG) { if (vp->v_type == VDIR) nd->nd_repstat = NFSERR_ISDIR; else nd->nd_repstat = NFSERR_INVAL; } if (!nd->nd_repstat) { if (lflags & NFSLCK_WRITE) { nd->nd_repstat = nfsvno_accchk(vp, VWRITE, nd->nd_cred, exp, p, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); } else { nd->nd_repstat = nfsvno_accchk(vp, VREAD, nd->nd_cred, exp, p, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat) nd->nd_repstat = nfsvno_accchk(vp, VEXEC, nd->nd_cred, exp, p, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); } } /* * We call nfsrv_lockctrl() even if nd_repstat set, so that the * seqid# gets updated. nfsrv_lockctrl() will return the value * of nd_repstat, if it gets that far. */ nd->nd_repstat = nfsrv_lockctrl(vp, &stp, &lop, &cf, clientid, &stateid, exp, nd, p); if (lop) free(lop, M_NFSDLOCK); if (stp) free(stp, M_NFSDSTATE); if (!nd->nd_repstat) { /* For NFSv4.1, set the Current StateID. */ if ((nd->nd_flag & ND_NFSV41) != 0) { nd->nd_curstateid = stateid; nd->nd_flag |= ND_CURSTATEID; } NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); *tl++ = txdr_unsigned(stateid.seqid); NFSBCOPY((caddr_t)stateid.other,(caddr_t)tl,NFSX_STATEIDOTHER); } else if (nd->nd_repstat == NFSERR_DENIED) { NFSM_BUILD(tl, u_int32_t *, 7 * NFSX_UNSIGNED); txdr_hyper(cf.cl_first, tl); tl += 2; if (cf.cl_end == NFS64BITSSET) len = NFS64BITSSET; else len = cf.cl_end - cf.cl_first; txdr_hyper(len, tl); tl += 2; if (cf.cl_flags == NFSLCK_WRITE) *tl++ = txdr_unsigned(NFSV4LOCKT_WRITE); else *tl++ = txdr_unsigned(NFSV4LOCKT_READ); *tl++ = stateid.other[0]; *tl = stateid.other[1]; (void) nfsm_strtom(nd, cf.cl_owner, cf.cl_ownerlen); } vput(vp); NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); if (stp) free(stp, M_NFSDSTATE); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 lock test service */ int nfsrvd_lockt(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { u_int32_t *tl; int i; struct nfsstate *stp = NULL; struct nfslock lo, *lop = &lo; struct nfslockconflict cf; int error = 0; nfsv4stateid_t stateid; nfsquad_t clientid; u_int64_t len; struct thread *p = curthread; NFSM_DISSECT(tl, u_int32_t *, 8 * NFSX_UNSIGNED); i = fxdr_unsigned(int, *(tl + 7)); if (i <= 0 || i > NFSV4_OPAQUELIMIT) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } stp = malloc(sizeof (struct nfsstate) + i, M_NFSDSTATE, M_WAITOK); stp->ls_ownerlen = i; stp->ls_op = NULL; stp->ls_flags = NFSLCK_TEST; stp->ls_uid = nd->nd_cred->cr_uid; i = fxdr_unsigned(int, *tl++); switch (i) { case NFSV4LOCKT_READW: stp->ls_flags |= NFSLCK_BLOCKING; case NFSV4LOCKT_READ: lo.lo_flags = NFSLCK_READ; break; case NFSV4LOCKT_WRITEW: stp->ls_flags |= NFSLCK_BLOCKING; case NFSV4LOCKT_WRITE: lo.lo_flags = NFSLCK_WRITE; break; default: nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } lo.lo_first = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); if (len == NFS64BITSSET) { lo.lo_end = NFS64BITSSET; } else { lo.lo_end = lo.lo_first + len; if (lo.lo_end <= lo.lo_first) nd->nd_repstat = NFSERR_INVAL; } tl += 2; clientid.lval[0] = *tl++; clientid.lval[1] = *tl; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK5 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } error = nfsrv_mtostr(nd, stp->ls_owner, stp->ls_ownerlen); if (error) goto nfsmout; if (!nd->nd_repstat && vp->v_type != VREG) { if (vp->v_type == VDIR) nd->nd_repstat = NFSERR_ISDIR; else nd->nd_repstat = NFSERR_INVAL; } if (!nd->nd_repstat) nd->nd_repstat = nfsrv_lockctrl(vp, &stp, &lop, &cf, clientid, &stateid, exp, nd, p); if (nd->nd_repstat) { if (nd->nd_repstat == NFSERR_DENIED) { NFSM_BUILD(tl, u_int32_t *, 7 * NFSX_UNSIGNED); txdr_hyper(cf.cl_first, tl); tl += 2; if (cf.cl_end == NFS64BITSSET) len = NFS64BITSSET; else len = cf.cl_end - cf.cl_first; txdr_hyper(len, tl); tl += 2; if (cf.cl_flags == NFSLCK_WRITE) *tl++ = txdr_unsigned(NFSV4LOCKT_WRITE); else *tl++ = txdr_unsigned(NFSV4LOCKT_READ); *tl++ = stp->ls_stateid.other[0]; *tl = stp->ls_stateid.other[1]; (void) nfsm_strtom(nd, cf.cl_owner, cf.cl_ownerlen); } } vput(vp); if (stp) free(stp, M_NFSDSTATE); NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); if (stp) free(stp, M_NFSDSTATE); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 unlock service */ int nfsrvd_locku(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { u_int32_t *tl; int i; struct nfsstate *stp; struct nfslock *lop; int error = 0; nfsv4stateid_t stateid; nfsquad_t clientid; u_int64_t len; struct thread *p = curthread; NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED + NFSX_STATEID); stp = malloc(sizeof (struct nfsstate), M_NFSDSTATE, M_WAITOK); lop = malloc(sizeof (struct nfslock), M_NFSDLOCK, M_WAITOK); stp->ls_flags = NFSLCK_UNLOCK; lop->lo_flags = NFSLCK_UNLOCK; stp->ls_op = nd->nd_rp; i = fxdr_unsigned(int, *tl++); switch (i) { case NFSV4LOCKT_READW: stp->ls_flags |= NFSLCK_BLOCKING; case NFSV4LOCKT_READ: break; case NFSV4LOCKT_WRITEW: stp->ls_flags |= NFSLCK_BLOCKING; case NFSV4LOCKT_WRITE: break; default: nd->nd_repstat = NFSERR_BADXDR; free(stp, M_NFSDSTATE); free(lop, M_NFSDLOCK); goto nfsmout; } stp->ls_ownerlen = 0; stp->ls_uid = nd->nd_cred->cr_uid; stp->ls_seq = fxdr_unsigned(int, *tl++); stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); NFSBCOPY((caddr_t)tl, (caddr_t)stp->ls_stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); /* * For the special stateid of other all 0s and seqid == 1, set the * stateid to the current stateid, if it is set. */ if ((nd->nd_flag & ND_NFSV41) != 0 && stp->ls_stateid.seqid == 1 && stp->ls_stateid.other[0] == 0 && stp->ls_stateid.other[1] == 0 && stp->ls_stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stp->ls_stateid = nd->nd_curstateid; stp->ls_stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; free(stp, M_NFSDSTATE); free(lop, M_NFSDLOCK); goto nfsmout; } } lop->lo_first = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); if (len == NFS64BITSSET) { lop->lo_end = NFS64BITSSET; } else { lop->lo_end = lop->lo_first + len; if (lop->lo_end <= lop->lo_first) nd->nd_repstat = NFSERR_INVAL; } clientid.lval[0] = stp->ls_stateid.other[0]; clientid.lval[1] = stp->ls_stateid.other[1]; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK6 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } if (!nd->nd_repstat && vp->v_type != VREG) { if (vp->v_type == VDIR) nd->nd_repstat = NFSERR_ISDIR; else nd->nd_repstat = NFSERR_INVAL; } /* * Call nfsrv_lockctrl() even if nd_repstat is set, so that the * seqid# gets incremented. nfsrv_lockctrl() will return the * value of nd_repstat, if it gets that far. */ nd->nd_repstat = nfsrv_lockctrl(vp, &stp, &lop, NULL, clientid, &stateid, exp, nd, p); if (stp) free(stp, M_NFSDSTATE); if (lop) free(lop, M_NFSDLOCK); if (!nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); *tl++ = txdr_unsigned(stateid.seqid); NFSBCOPY((caddr_t)stateid.other,(caddr_t)tl,NFSX_STATEIDOTHER); } nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 open service */ int nfsrvd_open(struct nfsrv_descript *nd, __unused int isdgram, vnode_t dp, vnode_t *vpp, __unused fhandle_t *fhp, struct nfsexstuff *exp) { u_int32_t *tl; int i, retext; struct nfsstate *stp = NULL; int error = 0, create, claim, exclusive_flag = 0, override; u_int32_t rflags = NFSV4OPEN_LOCKTYPEPOSIX, acemask; int how = NFSCREATE_UNCHECKED; int32_t cverf[2], tverf[2] = { 0, 0 }; vnode_t vp = NULL, dirp = NULL; struct nfsvattr nva, dirfor, diraft; struct nameidata named; nfsv4stateid_t stateid, delegstateid; nfsattrbit_t attrbits; nfsquad_t clientid; char *bufp = NULL; u_long *hashp; NFSACL_T *aclp = NULL; struct thread *p = curthread; #ifdef NFS4_ACL_EXTATTR_NAME aclp = acl_alloc(M_WAITOK); aclp->acl_cnt = 0; #endif NFSZERO_ATTRBIT(&attrbits); named.ni_startdir = NULL; named.ni_cnd.cn_nameiop = 0; NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED); i = fxdr_unsigned(int, *(tl + 5)); if (i <= 0 || i > NFSV4_OPAQUELIMIT) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } stp = malloc(sizeof (struct nfsstate) + i, M_NFSDSTATE, M_WAITOK); stp->ls_ownerlen = i; stp->ls_op = nd->nd_rp; stp->ls_flags = NFSLCK_OPEN; stp->ls_uid = nd->nd_cred->cr_uid; stp->ls_seq = fxdr_unsigned(u_int32_t, *tl++); i = fxdr_unsigned(int, *tl++); retext = 0; if ((i & (NFSV4OPEN_WANTDELEGMASK | NFSV4OPEN_WANTSIGNALDELEG | NFSV4OPEN_WANTPUSHDELEG)) != 0 && (nd->nd_flag & ND_NFSV41) != 0) { retext = 1; /* For now, ignore these. */ i &= ~(NFSV4OPEN_WANTPUSHDELEG | NFSV4OPEN_WANTSIGNALDELEG); switch (i & NFSV4OPEN_WANTDELEGMASK) { case NFSV4OPEN_WANTANYDELEG: stp->ls_flags |= (NFSLCK_WANTRDELEG | NFSLCK_WANTWDELEG); i &= ~NFSV4OPEN_WANTDELEGMASK; break; case NFSV4OPEN_WANTREADDELEG: stp->ls_flags |= NFSLCK_WANTRDELEG; i &= ~NFSV4OPEN_WANTDELEGMASK; break; case NFSV4OPEN_WANTWRITEDELEG: stp->ls_flags |= NFSLCK_WANTWDELEG; i &= ~NFSV4OPEN_WANTDELEGMASK; break; case NFSV4OPEN_WANTNODELEG: stp->ls_flags |= NFSLCK_WANTNODELEG; i &= ~NFSV4OPEN_WANTDELEGMASK; break; case NFSV4OPEN_WANTCANCEL: printf("NFSv4: ignore Open WantCancel\n"); i &= ~NFSV4OPEN_WANTDELEGMASK; break; default: /* nd_repstat will be set to NFSERR_INVAL below. */ break; } } switch (i) { case NFSV4OPEN_ACCESSREAD: stp->ls_flags |= NFSLCK_READACCESS; break; case NFSV4OPEN_ACCESSWRITE: stp->ls_flags |= NFSLCK_WRITEACCESS; break; case NFSV4OPEN_ACCESSBOTH: stp->ls_flags |= (NFSLCK_READACCESS | NFSLCK_WRITEACCESS); break; default: nd->nd_repstat = NFSERR_INVAL; } i = fxdr_unsigned(int, *tl++); switch (i) { case NFSV4OPEN_DENYNONE: break; case NFSV4OPEN_DENYREAD: stp->ls_flags |= NFSLCK_READDENY; break; case NFSV4OPEN_DENYWRITE: stp->ls_flags |= NFSLCK_WRITEDENY; break; case NFSV4OPEN_DENYBOTH: stp->ls_flags |= (NFSLCK_READDENY | NFSLCK_WRITEDENY); break; default: nd->nd_repstat = NFSERR_INVAL; } clientid.lval[0] = *tl++; clientid.lval[1] = *tl; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK7 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } error = nfsrv_mtostr(nd, stp->ls_owner, stp->ls_ownerlen); if (error) goto nfsmout; NFSVNO_ATTRINIT(&nva); NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); create = fxdr_unsigned(int, *tl); if (!nd->nd_repstat) nd->nd_repstat = nfsvno_getattr(dp, &dirfor, nd, p, 0, NULL); if (create == NFSV4OPEN_CREATE) { nva.na_type = VREG; nva.na_mode = 0; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); how = fxdr_unsigned(int, *tl); switch (how) { case NFSCREATE_UNCHECKED: case NFSCREATE_GUARDED: error = nfsv4_sattr(nd, NULL, &nva, &attrbits, aclp, p); if (error) goto nfsmout; /* * If the na_gid being set is the same as that of * the directory it is going in, clear it, since * that is what will be set by default. This allows * a user that isn't in that group to do the create. */ if (!nd->nd_repstat && NFSVNO_ISSETGID(&nva) && nva.na_gid == dirfor.na_gid) NFSVNO_UNSET(&nva, gid); if (!nd->nd_repstat) nd->nd_repstat = nfsrv_checkuidgid(nd, &nva); break; case NFSCREATE_EXCLUSIVE: NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF); cverf[0] = *tl++; cverf[1] = *tl; break; case NFSCREATE_EXCLUSIVE41: NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF); cverf[0] = *tl++; cverf[1] = *tl; error = nfsv4_sattr(nd, NULL, &nva, &attrbits, aclp, p); if (error != 0) goto nfsmout; if (NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET)) nd->nd_repstat = NFSERR_INVAL; /* * If the na_gid being set is the same as that of * the directory it is going in, clear it, since * that is what will be set by default. This allows * a user that isn't in that group to do the create. */ if (nd->nd_repstat == 0 && NFSVNO_ISSETGID(&nva) && nva.na_gid == dirfor.na_gid) NFSVNO_UNSET(&nva, gid); if (nd->nd_repstat == 0) nd->nd_repstat = nfsrv_checkuidgid(nd, &nva); break; default: nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } } else if (create != NFSV4OPEN_NOCREATE) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } /* * Now, handle the claim, which usually includes looking up a * name in the directory referenced by dp. The exception is * NFSV4OPEN_CLAIMPREVIOUS. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); claim = fxdr_unsigned(int, *tl); if (claim == NFSV4OPEN_CLAIMDELEGATECUR || claim == NFSV4OPEN_CLAIMDELEGATECURFH) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); NFSBCOPY((caddr_t)tl,(caddr_t)stateid.other,NFSX_STATEIDOTHER); stp->ls_flags |= NFSLCK_DELEGCUR; } else if (claim == NFSV4OPEN_CLAIMDELEGATEPREV || claim == NFSV4OPEN_CLAIMDELEGATEPREVFH) { stp->ls_flags |= NFSLCK_DELEGPREV; } if (claim == NFSV4OPEN_CLAIMNULL || claim == NFSV4OPEN_CLAIMDELEGATECUR || claim == NFSV4OPEN_CLAIMDELEGATEPREV) { if (!nd->nd_repstat && create == NFSV4OPEN_CREATE && claim != NFSV4OPEN_CLAIMNULL) nd->nd_repstat = NFSERR_INVAL; if (nd->nd_repstat) { nd->nd_repstat = nfsrv_opencheck(clientid, &stateid, stp, NULL, nd, p, nd->nd_repstat); goto nfsmout; } if (create == NFSV4OPEN_CREATE) NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, CREATE, LOCKPARENT | LOCKLEAF | SAVESTART | NOCACHE); else NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, LOOKUP, LOCKLEAF | SAVESTART); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error) { vrele(dp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif free(stp, M_NFSDSTATE); nfsvno_relpathbuf(&named); NFSEXITCODE2(error, nd); return (error); } if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_namei(nd, &named, dp, 0, exp, &dirp); } else { vrele(dp); nfsvno_relpathbuf(&named); } if (create == NFSV4OPEN_CREATE) { switch (how) { case NFSCREATE_UNCHECKED: if (named.ni_vp) { /* * Clear the setable attribute bits, except * for Size, if it is being truncated. */ NFSZERO_ATTRBIT(&attrbits); if (NFSVNO_ISSETSIZE(&nva)) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE); } break; case NFSCREATE_GUARDED: if (named.ni_vp && !nd->nd_repstat) nd->nd_repstat = EEXIST; break; case NFSCREATE_EXCLUSIVE: exclusive_flag = 1; if (!named.ni_vp) nva.na_mode = 0; break; case NFSCREATE_EXCLUSIVE41: exclusive_flag = 1; break; } } nfsvno_open(nd, &named, clientid, &stateid, stp, &exclusive_flag, &nva, cverf, create, aclp, &attrbits, nd->nd_cred, exp, &vp); } else if (claim == NFSV4OPEN_CLAIMPREVIOUS || claim == NFSV4OPEN_CLAIMFH || claim == NFSV4OPEN_CLAIMDELEGATECURFH || claim == NFSV4OPEN_CLAIMDELEGATEPREVFH) { if (claim == NFSV4OPEN_CLAIMPREVIOUS) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); switch (i) { case NFSV4OPEN_DELEGATEREAD: stp->ls_flags |= NFSLCK_DELEGREAD; break; case NFSV4OPEN_DELEGATEWRITE: stp->ls_flags |= NFSLCK_DELEGWRITE; case NFSV4OPEN_DELEGATENONE: break; default: nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } stp->ls_flags |= NFSLCK_RECLAIM; } else { if (nd->nd_repstat == 0 && create == NFSV4OPEN_CREATE) nd->nd_repstat = NFSERR_INVAL; } vp = dp; NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY); if (!VN_IS_DOOMED(vp)) nd->nd_repstat = nfsrv_opencheck(clientid, &stateid, stp, vp, nd, p, nd->nd_repstat); else nd->nd_repstat = NFSERR_PERM; } else { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } /* * Do basic access checking. */ if (!nd->nd_repstat && vp->v_type != VREG) { /* * The IETF working group decided that this is the correct * error return for all non-regular files. */ nd->nd_repstat = (vp->v_type == VDIR) ? NFSERR_ISDIR : NFSERR_SYMLINK; } /* * If the Open is being done for a file that already exists, apply * normal permission checking including for the file owner, if * vfs.nfsd.v4openaccess is set. * Previously, the owner was always allowed to open the file to * be consistent with the NFS tradition of always allowing the * owner of the file to write to the file regardless of permissions. * It now appears that the Linux client expects the owner * permissions to be checked for opens that are not creating the * file. I believe the correct approach is to use the Access * operation's results to be consistent with NFSv3, but that is * not what the current Linux client appears to be doing. * Since both the Linux and OpenSolaris NFSv4 servers do this check, * I have enabled it by default. Since Linux does not apply this * check for claim_delegate_cur, this code does the same. * If this semantic change causes a problem, it can be disabled by * setting the sysctl vfs.nfsd.v4openaccess to 0 to re-enable the * previous semantics. */ if (nfsrv_openaccess && create == NFSV4OPEN_NOCREATE && (stp->ls_flags & NFSLCK_DELEGCUR) == 0) override = NFSACCCHK_NOOVERRIDE; else override = NFSACCCHK_ALLOWOWNER; if (!nd->nd_repstat && (stp->ls_flags & NFSLCK_WRITEACCESS)) nd->nd_repstat = nfsvno_accchk(vp, VWRITE, nd->nd_cred, exp, p, override, NFSACCCHK_VPISLOCKED, NULL); if (!nd->nd_repstat && (stp->ls_flags & NFSLCK_READACCESS)) { nd->nd_repstat = nfsvno_accchk(vp, VREAD, nd->nd_cred, exp, p, override, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat) nd->nd_repstat = nfsvno_accchk(vp, VEXEC, nd->nd_cred, exp, p, override, NFSACCCHK_VPISLOCKED, NULL); } if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); if (!nd->nd_repstat) { tverf[0] = nva.na_atime.tv_sec; tverf[1] = nva.na_atime.tv_nsec; } } if (!nd->nd_repstat && exclusive_flag && (cverf[0] != tverf[0] || cverf[1] != tverf[1])) nd->nd_repstat = EEXIST; /* * Do the open locking/delegation stuff. */ if (!nd->nd_repstat) nd->nd_repstat = nfsrv_openctrl(nd, vp, &stp, clientid, &stateid, &delegstateid, &rflags, exp, p, nva.na_filerev); /* * vp must be unlocked before the call to nfsvno_getattr(dirp,...) * below, to avoid a deadlock with the lookup in nfsvno_namei() above. * (ie: Leave the NFSVOPUNLOCK() about here.) */ if (vp) NFSVOPUNLOCK(vp); if (stp) free(stp, M_NFSDSTATE); if (!nd->nd_repstat && dirp) nd->nd_repstat = nfsvno_getattr(dirp, &diraft, nd, p, 0, NULL); if (!nd->nd_repstat) { /* For NFSv4.1, set the Current StateID. */ if ((nd->nd_flag & ND_NFSV41) != 0) { nd->nd_curstateid = stateid; nd->nd_flag |= ND_CURSTATEID; } NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(stateid.seqid); NFSBCOPY((caddr_t)stateid.other,(caddr_t)tl,NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); if (claim == NFSV4OPEN_CLAIMPREVIOUS) { *tl++ = newnfs_true; *tl++ = 0; *tl++ = 0; *tl++ = 0; *tl++ = 0; } else { *tl++ = newnfs_false; /* Since dirp is not locked */ txdr_hyper(dirfor.na_filerev, tl); tl += 2; txdr_hyper(diraft.na_filerev, tl); tl += 2; } *tl = txdr_unsigned(rflags & NFSV4OPEN_RFLAGS); (void) nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (rflags & NFSV4OPEN_READDELEGATE) *tl = txdr_unsigned(NFSV4OPEN_DELEGATEREAD); else if (rflags & NFSV4OPEN_WRITEDELEGATE) *tl = txdr_unsigned(NFSV4OPEN_DELEGATEWRITE); else if (retext != 0) { *tl = txdr_unsigned(NFSV4OPEN_DELEGATENONEEXT); if ((rflags & NFSV4OPEN_WDNOTWANTED) != 0) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_NOTWANTED); } else if ((rflags & NFSV4OPEN_WDSUPPFTYPE) != 0) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_NOTSUPPFTYPE); } else if ((rflags & NFSV4OPEN_WDCONTENTION) != 0) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_CONTENTION); *tl = newnfs_false; } else if ((rflags & NFSV4OPEN_WDRESOURCE) != 0) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_RESOURCE); *tl = newnfs_false; } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_NOTWANTED); } } else *tl = txdr_unsigned(NFSV4OPEN_DELEGATENONE); if (rflags & (NFSV4OPEN_READDELEGATE|NFSV4OPEN_WRITEDELEGATE)) { NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID+NFSX_UNSIGNED); *tl++ = txdr_unsigned(delegstateid.seqid); NFSBCOPY((caddr_t)delegstateid.other, (caddr_t)tl, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); if (rflags & NFSV4OPEN_RECALL) *tl = newnfs_true; else *tl = newnfs_false; if (rflags & NFSV4OPEN_WRITEDELEGATE) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_LIMITSIZE); txdr_hyper(nva.na_size, tl); } NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4ACE_ALLOWEDTYPE); *tl++ = txdr_unsigned(0x0); acemask = NFSV4ACE_ALLFILESMASK; if (nva.na_mode & S_IRUSR) acemask |= NFSV4ACE_READMASK; if (nva.na_mode & S_IWUSR) acemask |= NFSV4ACE_WRITEMASK; if (nva.na_mode & S_IXUSR) acemask |= NFSV4ACE_EXECUTEMASK; *tl = txdr_unsigned(acemask); (void) nfsm_strtom(nd, "OWNER@", 6); } *vpp = vp; } else if (vp) { vrele(vp); } if (dirp) vrele(dirp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif NFSEXITCODE2(0, nd); return (0); nfsmout: vrele(dp); #ifdef NFS4_ACL_EXTATTR_NAME acl_free(aclp); #endif if (stp) free(stp, M_NFSDSTATE); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 close service */ int nfsrvd_close(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; struct nfsstate st, *stp = &st; int error = 0, writeacc; nfsv4stateid_t stateid; nfsquad_t clientid; struct nfsvattr na; struct thread *p = curthread; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID); stp->ls_seq = fxdr_unsigned(u_int32_t, *tl++); stp->ls_ownerlen = 0; stp->ls_op = nd->nd_rp; stp->ls_uid = nd->nd_cred->cr_uid; stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); NFSBCOPY((caddr_t)tl, (caddr_t)stp->ls_stateid.other, NFSX_STATEIDOTHER); /* * For the special stateid of other all 0s and seqid == 1, set the * stateid to the current stateid, if it is set. */ if ((nd->nd_flag & ND_NFSV41) != 0 && stp->ls_stateid.seqid == 1 && stp->ls_stateid.other[0] == 0 && stp->ls_stateid.other[1] == 0 && stp->ls_stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) stp->ls_stateid = nd->nd_curstateid; else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } stp->ls_flags = NFSLCK_CLOSE; clientid.lval[0] = stp->ls_stateid.other[0]; clientid.lval[1] = stp->ls_stateid.other[1]; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK8 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } nd->nd_repstat = nfsrv_openupdate(vp, stp, clientid, &stateid, nd, p, &writeacc); /* For pNFS, update the attributes. */ if (writeacc != 0 || nfsrv_pnfsatime != 0) nfsrv_updatemdsattr(vp, &na, p); vput(vp); if (!nd->nd_repstat) { /* * If the stateid that has been closed is the current stateid, * unset it. */ if ((nd->nd_flag & ND_CURSTATEID) != 0 && stateid.other[0] == nd->nd_curstateid.other[0] && stateid.other[1] == nd->nd_curstateid.other[1] && stateid.other[2] == nd->nd_curstateid.other[2]) nd->nd_flag &= ~ND_CURSTATEID; NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); *tl++ = txdr_unsigned(stateid.seqid); NFSBCOPY((caddr_t)stateid.other,(caddr_t)tl,NFSX_STATEIDOTHER); } NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 delegpurge service */ int nfsrvd_delegpurge(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; int error = 0; nfsquad_t clientid; struct thread *p = curthread; if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); clientid.lval[0] = *tl++; clientid.lval[1] = *tl; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK9 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } nd->nd_repstat = nfsrv_delegupdate(nd, clientid, NULL, NULL, NFSV4OP_DELEGPURGE, nd->nd_cred, p, NULL); nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 delegreturn service */ int nfsrvd_delegreturn(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; int error = 0, writeacc; nfsv4stateid_t stateid; nfsquad_t clientid; struct nfsvattr na; struct thread *p = curthread; NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); NFSBCOPY((caddr_t)tl, (caddr_t)stateid.other, NFSX_STATEIDOTHER); clientid.lval[0] = stateid.other[0]; clientid.lval[1] = stateid.other[1]; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK10 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } nd->nd_repstat = nfsrv_delegupdate(nd, clientid, &stateid, vp, NFSV4OP_DELEGRETURN, nd->nd_cred, p, &writeacc); /* For pNFS, update the attributes. */ if (writeacc != 0 || nfsrv_pnfsatime != 0) nfsrv_updatemdsattr(vp, &na, p); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 get file handle service */ int nfsrvd_getfh(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { fhandle_t fh; struct thread *p = curthread; nd->nd_repstat = nfsvno_getfh(vp, &fh, p); vput(vp); if (!nd->nd_repstat) (void) nfsm_fhtom(nd, (u_int8_t *)&fh, 0, 0); NFSEXITCODE2(0, nd); return (0); } /* * nfsv4 open confirm service */ int nfsrvd_openconfirm(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; struct nfsstate st, *stp = &st; int error = 0; nfsv4stateid_t stateid; nfsquad_t clientid; struct thread *p = curthread; if ((nd->nd_flag & ND_NFSV41) != 0) { nd->nd_repstat = NFSERR_NOTSUPP; goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); stp->ls_ownerlen = 0; stp->ls_op = nd->nd_rp; stp->ls_uid = nd->nd_cred->cr_uid; stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); NFSBCOPY((caddr_t)tl, (caddr_t)stp->ls_stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); stp->ls_seq = fxdr_unsigned(u_int32_t, *tl); stp->ls_flags = NFSLCK_CONFIRM; clientid.lval[0] = stp->ls_stateid.other[0]; clientid.lval[1] = stp->ls_stateid.other[1]; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK11 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } nd->nd_repstat = nfsrv_openupdate(vp, stp, clientid, &stateid, nd, p, NULL); if (!nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); *tl++ = txdr_unsigned(stateid.seqid); NFSBCOPY((caddr_t)stateid.other,(caddr_t)tl,NFSX_STATEIDOTHER); } nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 open downgrade service */ int nfsrvd_opendowngrade(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; int i; struct nfsstate st, *stp = &st; int error = 0; nfsv4stateid_t stateid; nfsquad_t clientid; struct thread *p = curthread; /* opendowngrade can only work on a file object.*/ if (vp->v_type != VREG) { error = NFSERR_INVAL; goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 3 * NFSX_UNSIGNED); stp->ls_ownerlen = 0; stp->ls_op = nd->nd_rp; stp->ls_uid = nd->nd_cred->cr_uid; stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); NFSBCOPY((caddr_t)tl, (caddr_t)stp->ls_stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); /* * For the special stateid of other all 0s and seqid == 1, set the * stateid to the current stateid, if it is set. */ if ((nd->nd_flag & ND_NFSV41) != 0 && stp->ls_stateid.seqid == 1 && stp->ls_stateid.other[0] == 0 && stp->ls_stateid.other[1] == 0 && stp->ls_stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) stp->ls_stateid = nd->nd_curstateid; else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } stp->ls_seq = fxdr_unsigned(u_int32_t, *tl++); i = fxdr_unsigned(int, *tl++); if ((nd->nd_flag & ND_NFSV41) != 0) i &= ~NFSV4OPEN_WANTDELEGMASK; switch (i) { case NFSV4OPEN_ACCESSREAD: stp->ls_flags = (NFSLCK_READACCESS | NFSLCK_DOWNGRADE); break; case NFSV4OPEN_ACCESSWRITE: stp->ls_flags = (NFSLCK_WRITEACCESS | NFSLCK_DOWNGRADE); break; case NFSV4OPEN_ACCESSBOTH: stp->ls_flags = (NFSLCK_READACCESS | NFSLCK_WRITEACCESS | NFSLCK_DOWNGRADE); break; default: nd->nd_repstat = NFSERR_INVAL; } i = fxdr_unsigned(int, *tl); switch (i) { case NFSV4OPEN_DENYNONE: break; case NFSV4OPEN_DENYREAD: stp->ls_flags |= NFSLCK_READDENY; break; case NFSV4OPEN_DENYWRITE: stp->ls_flags |= NFSLCK_WRITEDENY; break; case NFSV4OPEN_DENYBOTH: stp->ls_flags |= (NFSLCK_READDENY | NFSLCK_WRITEDENY); break; default: nd->nd_repstat = NFSERR_INVAL; } clientid.lval[0] = stp->ls_stateid.other[0]; clientid.lval[1] = stp->ls_stateid.other[1]; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK12 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } if (!nd->nd_repstat) nd->nd_repstat = nfsrv_openupdate(vp, stp, clientid, &stateid, nd, p, NULL); if (!nd->nd_repstat) { /* For NFSv4.1, set the Current StateID. */ if ((nd->nd_flag & ND_NFSV41) != 0) { nd->nd_curstateid = stateid; nd->nd_flag |= ND_CURSTATEID; } NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); *tl++ = txdr_unsigned(stateid.seqid); NFSBCOPY((caddr_t)stateid.other,(caddr_t)tl,NFSX_STATEIDOTHER); } nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 renew lease service */ int nfsrvd_renew(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; int error = 0; nfsquad_t clientid; struct thread *p = curthread; if ((nd->nd_flag & ND_NFSV41) != 0) { nd->nd_repstat = NFSERR_NOTSUPP; goto nfsmout; } if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); clientid.lval[0] = *tl++; clientid.lval[1] = *tl; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK13 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } nd->nd_repstat = nfsrv_getclient(clientid, (CLOPS_RENEWOP|CLOPS_RENEW), NULL, NULL, (nfsquad_t)((u_quad_t)0), 0, nd, p); nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 security info service */ int nfsrvd_secinfo(struct nfsrv_descript *nd, int isdgram, vnode_t dp, struct nfsexstuff *exp) { u_int32_t *tl; int len; struct nameidata named; vnode_t dirp = NULL, vp; struct nfsrvfh fh; struct nfsexstuff retnes; u_int32_t *sizp; int error = 0, i; uint64_t savflag; char *bufp; u_long *hashp; struct thread *p = curthread; /* * All this just to get the export flags for the name. */ NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, LOOKUP, LOCKLEAF | SAVESTART); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error) { vput(dp); nfsvno_relpathbuf(&named); goto out; } if (!nd->nd_repstat) { nd->nd_repstat = nfsvno_namei(nd, &named, dp, 1, exp, &dirp); } else { vput(dp); nfsvno_relpathbuf(&named); } if (dirp) vrele(dirp); if (nd->nd_repstat) goto out; vrele(named.ni_startdir); nfsvno_relpathbuf(&named); fh.nfsrvfh_len = NFSX_MYFH; vp = named.ni_vp; nd->nd_repstat = nfsvno_getfh(vp, (fhandle_t *)fh.nfsrvfh_data, p); vput(vp); savflag = nd->nd_flag; if (!nd->nd_repstat) { /* * Pretend the next op is Secinfo, so that no wrongsec * test will be done. */ nfsd_fhtovp(nd, &fh, LK_SHARED, &vp, &retnes, NULL, 0, NFSV4OP_SECINFO); if (vp) vput(vp); } nd->nd_flag = savflag; if (nd->nd_repstat) goto out; /* * Finally have the export flags for name, so we can create * the security info. */ len = 0; NFSM_BUILD(sizp, u_int32_t *, NFSX_UNSIGNED); /* If nes_numsecflavor == 0, all are allowed. */ if (retnes.nes_numsecflavor == 0) { NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(RPCAUTH_UNIX); *tl = txdr_unsigned(RPCAUTH_GSS); nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, uint32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl++ = txdr_unsigned(RPCAUTHGSS_SVCNONE); *tl = txdr_unsigned(RPCAUTH_GSS); nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, uint32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl++ = txdr_unsigned(RPCAUTHGSS_SVCINTEGRITY); *tl = txdr_unsigned(RPCAUTH_GSS); nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl = txdr_unsigned(RPCAUTHGSS_SVCPRIVACY); len = 4; } for (i = 0; i < retnes.nes_numsecflavor; i++) { if (retnes.nes_secflavors[i] == AUTH_SYS) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(RPCAUTH_UNIX); len++; } else if (retnes.nes_secflavors[i] == RPCSEC_GSS_KRB5) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl++ = txdr_unsigned(RPCAUTH_GSS); (void) nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl = txdr_unsigned(RPCAUTHGSS_SVCNONE); len++; } else if (retnes.nes_secflavors[i] == RPCSEC_GSS_KRB5I) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl++ = txdr_unsigned(RPCAUTH_GSS); (void) nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl = txdr_unsigned(RPCAUTHGSS_SVCINTEGRITY); len++; } else if (retnes.nes_secflavors[i] == RPCSEC_GSS_KRB5P) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl++ = txdr_unsigned(RPCAUTH_GSS); (void) nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl = txdr_unsigned(RPCAUTHGSS_SVCPRIVACY); len++; } } *sizp = txdr_unsigned(len); out: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 security info no name service */ int nfsrvd_secinfononame(struct nfsrv_descript *nd, int isdgram, vnode_t dp, struct nfsexstuff *exp) { uint32_t *tl, *sizp; struct nameidata named; vnode_t dirp = NULL, vp; struct nfsrvfh fh; struct nfsexstuff retnes; int error = 0, fhstyle, i, len; uint64_t savflag; char *bufp; u_long *hashp; struct thread *p = curthread; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); fhstyle = fxdr_unsigned(int, *tl); switch (fhstyle) { case NFSSECINFONONAME_PARENT: if (dp->v_type != VDIR) { vput(dp); nd->nd_repstat = NFSERR_NOTDIR; goto nfsmout; } NFSNAMEICNDSET(&named.ni_cnd, nd->nd_cred, LOOKUP, LOCKLEAF | SAVESTART); nfsvno_setpathbuf(&named, &bufp, &hashp); error = nfsrv_parsename(nd, bufp, hashp, &named.ni_pathlen); if (error != 0) { vput(dp); nfsvno_relpathbuf(&named); goto nfsmout; } if (nd->nd_repstat == 0) nd->nd_repstat = nfsvno_namei(nd, &named, dp, 1, exp, &dirp); else vput(dp); if (dirp != NULL) vrele(dirp); vrele(named.ni_startdir); nfsvno_relpathbuf(&named); vp = named.ni_vp; break; case NFSSECINFONONAME_CURFH: vp = dp; break; default: nd->nd_repstat = NFSERR_INVAL; vput(dp); } if (nd->nd_repstat != 0) goto nfsmout; fh.nfsrvfh_len = NFSX_MYFH; nd->nd_repstat = nfsvno_getfh(vp, (fhandle_t *)fh.nfsrvfh_data, p); vput(vp); savflag = nd->nd_flag; if (nd->nd_repstat == 0) { /* * Pretend the next op is Secinfo, so that no wrongsec * test will be done. */ nfsd_fhtovp(nd, &fh, LK_SHARED, &vp, &retnes, NULL, 0, NFSV4OP_SECINFO); if (vp != NULL) vput(vp); } nd->nd_flag = savflag; if (nd->nd_repstat != 0) goto nfsmout; /* * Finally have the export flags for fh/parent, so we can create * the security info. */ len = 0; NFSM_BUILD(sizp, uint32_t *, NFSX_UNSIGNED); /* If nes_numsecflavor == 0, all are allowed. */ if (retnes.nes_numsecflavor == 0) { NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(RPCAUTH_UNIX); *tl = txdr_unsigned(RPCAUTH_GSS); nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, uint32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl++ = txdr_unsigned(RPCAUTHGSS_SVCNONE); *tl = txdr_unsigned(RPCAUTH_GSS); nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, uint32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl++ = txdr_unsigned(RPCAUTHGSS_SVCINTEGRITY); *tl = txdr_unsigned(RPCAUTH_GSS); nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl = txdr_unsigned(RPCAUTHGSS_SVCPRIVACY); len = 4; } for (i = 0; i < retnes.nes_numsecflavor; i++) { if (retnes.nes_secflavors[i] == AUTH_SYS) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(RPCAUTH_UNIX); len++; } else if (retnes.nes_secflavors[i] == RPCSEC_GSS_KRB5) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(RPCAUTH_GSS); nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl = txdr_unsigned(RPCAUTHGSS_SVCNONE); len++; } else if (retnes.nes_secflavors[i] == RPCSEC_GSS_KRB5I) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(RPCAUTH_GSS); nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl = txdr_unsigned(RPCAUTHGSS_SVCINTEGRITY); len++; } else if (retnes.nes_secflavors[i] == RPCSEC_GSS_KRB5P) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(RPCAUTH_GSS); nfsm_strtom(nd, nfsgss_mechlist[KERBV_MECH].str, nfsgss_mechlist[KERBV_MECH].len); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(GSS_KERBV_QOP); *tl = txdr_unsigned(RPCAUTHGSS_SVCPRIVACY); len++; } } *sizp = txdr_unsigned(len); nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 set client id service */ int nfsrvd_setclientid(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; int i; int error = 0, idlen; struct nfsclient *clp = NULL; #ifdef INET struct sockaddr_in *rin; #endif #ifdef INET6 struct sockaddr_in6 *rin6; #endif #if defined(INET) || defined(INET6) u_char *ucp, *ucp2; #endif u_char *verf, *addrbuf; nfsquad_t clientid, confirm; struct thread *p = curthread; if ((nd->nd_flag & ND_NFSV41) != 0) { nd->nd_repstat = NFSERR_NOTSUPP; goto nfsmout; } if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto out; NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF + NFSX_UNSIGNED); verf = (u_char *)tl; tl += (NFSX_VERF / NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); if (i > NFSV4_OPAQUELIMIT || i <= 0) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } idlen = i; if (nd->nd_flag & ND_GSS) i += nd->nd_princlen; clp = malloc(sizeof(struct nfsclient) + i, M_NFSDCLIENT, M_WAITOK | M_ZERO); clp->lc_stateid = malloc(sizeof(struct nfsstatehead) * nfsrv_statehashsize, M_NFSDCLIENT, M_WAITOK); NFSINITSOCKMUTEX(&clp->lc_req.nr_mtx); /* Allocated large enough for an AF_INET or AF_INET6 socket. */ clp->lc_req.nr_nam = malloc(sizeof(struct sockaddr_in6), M_SONAME, M_WAITOK | M_ZERO); clp->lc_req.nr_cred = NULL; NFSBCOPY(verf, clp->lc_verf, NFSX_VERF); clp->lc_idlen = idlen; error = nfsrv_mtostr(nd, clp->lc_id, idlen); if (error) goto nfsmout; if (nd->nd_flag & ND_GSS) { clp->lc_flags = LCL_GSS; if (nd->nd_flag & ND_GSSINTEGRITY) clp->lc_flags |= LCL_GSSINTEGRITY; else if (nd->nd_flag & ND_GSSPRIVACY) clp->lc_flags |= LCL_GSSPRIVACY; } else { clp->lc_flags = 0; } if ((nd->nd_flag & ND_GSS) && nd->nd_princlen > 0) { clp->lc_flags |= LCL_NAME; clp->lc_namelen = nd->nd_princlen; clp->lc_name = &clp->lc_id[idlen]; NFSBCOPY(nd->nd_principal, clp->lc_name, clp->lc_namelen); } else { clp->lc_uid = nd->nd_cred->cr_uid; clp->lc_gid = nd->nd_cred->cr_gid; } /* If the client is using TLS, do so for the callback connection. */ if (nd->nd_flag & ND_TLS) clp->lc_flags |= LCL_TLSCB; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); clp->lc_program = fxdr_unsigned(u_int32_t, *tl); error = nfsrv_getclientipaddr(nd, clp); if (error) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); clp->lc_callback = fxdr_unsigned(u_int32_t, *tl); /* * nfsrv_setclient() does the actual work of adding it to the * client list. If there is no error, the structure has been * linked into the client list and clp should no longer be used * here. When an error is returned, it has not been linked in, * so it should be free'd. */ nd->nd_repstat = nfsrv_setclient(nd, &clp, &clientid, &confirm, p); if (nd->nd_repstat == NFSERR_CLIDINUSE) { /* * 8 is the maximum length of the port# string. */ addrbuf = malloc(INET6_ADDRSTRLEN + 8, M_TEMP, M_WAITOK); switch (clp->lc_req.nr_nam->sa_family) { #ifdef INET case AF_INET: if (clp->lc_flags & LCL_TCPCALLBACK) (void) nfsm_strtom(nd, "tcp", 3); else (void) nfsm_strtom(nd, "udp", 3); rin = (struct sockaddr_in *)clp->lc_req.nr_nam; ucp = (u_char *)&rin->sin_addr.s_addr; ucp2 = (u_char *)&rin->sin_port; sprintf(addrbuf, "%d.%d.%d.%d.%d.%d", ucp[0] & 0xff, ucp[1] & 0xff, ucp[2] & 0xff, ucp[3] & 0xff, ucp2[0] & 0xff, ucp2[1] & 0xff); break; #endif #ifdef INET6 case AF_INET6: if (clp->lc_flags & LCL_TCPCALLBACK) (void) nfsm_strtom(nd, "tcp6", 4); else (void) nfsm_strtom(nd, "udp6", 4); rin6 = (struct sockaddr_in6 *)clp->lc_req.nr_nam; ucp = inet_ntop(AF_INET6, &rin6->sin6_addr, addrbuf, INET6_ADDRSTRLEN); if (ucp != NULL) i = strlen(ucp); else i = 0; ucp2 = (u_char *)&rin6->sin6_port; sprintf(&addrbuf[i], ".%d.%d", ucp2[0] & 0xff, ucp2[1] & 0xff); break; #endif } (void) nfsm_strtom(nd, addrbuf, strlen(addrbuf)); free(addrbuf, M_TEMP); } if (clp) { free(clp->lc_req.nr_nam, M_SONAME); NFSFREEMUTEX(&clp->lc_req.nr_mtx); free(clp->lc_stateid, M_NFSDCLIENT); free(clp, M_NFSDCLIENT); } if (!nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_HYPER); *tl++ = clientid.lval[0]; *tl++ = clientid.lval[1]; *tl++ = confirm.lval[0]; *tl = confirm.lval[1]; } out: NFSEXITCODE2(0, nd); return (0); nfsmout: if (clp) { free(clp->lc_req.nr_nam, M_SONAME); NFSFREEMUTEX(&clp->lc_req.nr_mtx); free(clp->lc_stateid, M_NFSDCLIENT); free(clp, M_NFSDCLIENT); } NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 set client id confirm service */ int nfsrvd_setclientidcfrm(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; int error = 0; nfsquad_t clientid, confirm; struct thread *p = curthread; if ((nd->nd_flag & ND_NFSV41) != 0) { nd->nd_repstat = NFSERR_NOTSUPP; goto nfsmout; } if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_HYPER); clientid.lval[0] = *tl++; clientid.lval[1] = *tl++; confirm.lval[0] = *tl++; confirm.lval[1] = *tl; /* * nfsrv_getclient() searches the client list for a match and * returns the appropriate NFSERR status. */ nd->nd_repstat = nfsrv_getclient(clientid, (CLOPS_CONFIRM|CLOPS_RENEW), NULL, NULL, confirm, 0, nd, p); nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 verify service */ int nfsrvd_verify(struct nfsrv_descript *nd, int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { int error = 0, ret, fhsize = NFSX_MYFH; struct nfsvattr nva; struct statfs *sf; struct nfsfsinfo fs; fhandle_t fh; struct thread *p = curthread; sf = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); nd->nd_repstat = nfsvno_getattr(vp, &nva, nd, p, 1, NULL); if (!nd->nd_repstat) nd->nd_repstat = nfsvno_statfs(vp, sf); if (!nd->nd_repstat) nd->nd_repstat = nfsvno_getfh(vp, &fh, p); if (!nd->nd_repstat) { nfsvno_getfs(&fs, isdgram); error = nfsv4_loadattr(nd, vp, &nva, NULL, &fh, fhsize, NULL, sf, NULL, &fs, NULL, 1, &ret, NULL, NULL, p, nd->nd_cred); if (!error) { if (nd->nd_procnum == NFSV4OP_NVERIFY) { if (ret == 0) nd->nd_repstat = NFSERR_SAME; else if (ret != NFSERR_NOTSAME) nd->nd_repstat = ret; } else if (ret) nd->nd_repstat = ret; } } vput(vp); free(sf, M_STATFS); NFSEXITCODE2(error, nd); return (error); } /* * nfs openattr rpc */ int nfsrvd_openattr(struct nfsrv_descript *nd, __unused int isdgram, vnode_t dp, __unused vnode_t *vpp, __unused fhandle_t *fhp, __unused struct nfsexstuff *exp) { u_int32_t *tl; int error = 0, createdir __unused; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); createdir = fxdr_unsigned(int, *tl); nd->nd_repstat = NFSERR_NOTSUPP; nfsmout: vrele(dp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 release lock owner service */ int nfsrvd_releaselckown(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { u_int32_t *tl; struct nfsstate *stp = NULL; int error = 0, len; nfsquad_t clientid; struct thread *p = curthread; if ((nd->nd_flag & ND_NFSV41) != 0) { nd->nd_repstat = NFSERR_NOTSUPP; goto nfsmout; } if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); len = fxdr_unsigned(int, *(tl + 2)); if (len <= 0 || len > NFSV4_OPAQUELIMIT) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } stp = malloc(sizeof (struct nfsstate) + len, M_NFSDSTATE, M_WAITOK); stp->ls_ownerlen = len; stp->ls_op = NULL; stp->ls_flags = NFSLCK_RELEASE; stp->ls_uid = nd->nd_cred->cr_uid; clientid.lval[0] = *tl++; clientid.lval[1] = *tl; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK14 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } error = nfsrv_mtostr(nd, stp->ls_owner, len); if (error) goto nfsmout; nd->nd_repstat = nfsrv_releaselckown(stp, clientid, p); free(stp, M_NFSDSTATE); NFSEXITCODE2(0, nd); return (0); nfsmout: if (stp) free(stp, M_NFSDSTATE); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 exchange_id service */ int nfsrvd_exchangeid(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; int error = 0, i, idlen; struct nfsclient *clp = NULL; nfsquad_t clientid, confirm; uint8_t *verf; uint32_t sp4type, v41flags; struct timespec verstime; #ifdef INET struct sockaddr_in *sin, *rin; #endif #ifdef INET6 struct sockaddr_in6 *sin6, *rin6; #endif struct thread *p = curthread; char *s; if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF + NFSX_UNSIGNED); verf = (uint8_t *)tl; tl += (NFSX_VERF / NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); if (i > NFSV4_OPAQUELIMIT || i <= 0) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } idlen = i; if (nd->nd_flag & ND_GSS) i += nd->nd_princlen; clp = malloc(sizeof(struct nfsclient) + i, M_NFSDCLIENT, M_WAITOK | M_ZERO); clp->lc_stateid = malloc(sizeof(struct nfsstatehead) * nfsrv_statehashsize, M_NFSDCLIENT, M_WAITOK); NFSINITSOCKMUTEX(&clp->lc_req.nr_mtx); /* Allocated large enough for an AF_INET or AF_INET6 socket. */ clp->lc_req.nr_nam = malloc(sizeof(struct sockaddr_in6), M_SONAME, M_WAITOK | M_ZERO); switch (nd->nd_nam->sa_family) { #ifdef INET case AF_INET: rin = (struct sockaddr_in *)clp->lc_req.nr_nam; sin = (struct sockaddr_in *)nd->nd_nam; rin->sin_family = AF_INET; rin->sin_len = sizeof(struct sockaddr_in); rin->sin_port = 0; rin->sin_addr.s_addr = sin->sin_addr.s_addr; break; #endif #ifdef INET6 case AF_INET6: rin6 = (struct sockaddr_in6 *)clp->lc_req.nr_nam; sin6 = (struct sockaddr_in6 *)nd->nd_nam; rin6->sin6_family = AF_INET6; rin6->sin6_len = sizeof(struct sockaddr_in6); rin6->sin6_port = 0; rin6->sin6_addr = sin6->sin6_addr; break; #endif } clp->lc_req.nr_cred = NULL; NFSBCOPY(verf, clp->lc_verf, NFSX_VERF); clp->lc_idlen = idlen; error = nfsrv_mtostr(nd, clp->lc_id, idlen); if (error != 0) goto nfsmout; if ((nd->nd_flag & ND_GSS) != 0) { clp->lc_flags = LCL_GSS | LCL_NFSV41; if ((nd->nd_flag & ND_GSSINTEGRITY) != 0) clp->lc_flags |= LCL_GSSINTEGRITY; else if ((nd->nd_flag & ND_GSSPRIVACY) != 0) clp->lc_flags |= LCL_GSSPRIVACY; } else clp->lc_flags = LCL_NFSV41; if ((nd->nd_flag & ND_NFSV42) != 0) clp->lc_flags |= LCL_NFSV42; if ((nd->nd_flag & ND_GSS) != 0 && nd->nd_princlen > 0) { clp->lc_flags |= LCL_NAME; clp->lc_namelen = nd->nd_princlen; clp->lc_name = &clp->lc_id[idlen]; NFSBCOPY(nd->nd_principal, clp->lc_name, clp->lc_namelen); } else { clp->lc_uid = nd->nd_cred->cr_uid; clp->lc_gid = nd->nd_cred->cr_gid; } NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); v41flags = fxdr_unsigned(uint32_t, *tl++); if ((v41flags & ~(NFSV4EXCH_SUPPMOVEDREFER | NFSV4EXCH_SUPPMOVEDMIGR | NFSV4EXCH_BINDPRINCSTATEID | NFSV4EXCH_MASKPNFS | NFSV4EXCH_UPDCONFIRMEDRECA)) != 0) { nd->nd_repstat = NFSERR_INVAL; goto nfsmout; } if ((v41flags & NFSV4EXCH_UPDCONFIRMEDRECA) != 0) confirm.lval[1] = 1; else confirm.lval[1] = 0; if (nfsrv_devidcnt == 0) v41flags = NFSV4EXCH_USENONPNFS | NFSV4EXCH_USEPNFSDS; else v41flags = NFSV4EXCH_USEPNFSMDS; sp4type = fxdr_unsigned(uint32_t, *tl); if (sp4type != NFSV4EXCH_SP4NONE) { nd->nd_repstat = NFSERR_NOTSUPP; goto nfsmout; } /* * nfsrv_setclient() does the actual work of adding it to the * client list. If there is no error, the structure has been * linked into the client list and clp should no longer be used * here. When an error is returned, it has not been linked in, * so it should be free'd. */ nd->nd_repstat = nfsrv_setclient(nd, &clp, &clientid, &confirm, p); if (clp != NULL) { free(clp->lc_req.nr_nam, M_SONAME); NFSFREEMUTEX(&clp->lc_req.nr_mtx); free(clp->lc_stateid, M_NFSDCLIENT); free(clp, M_NFSDCLIENT); } if (nd->nd_repstat == 0) { if (confirm.lval[1] != 0) v41flags |= NFSV4EXCH_CONFIRMEDR; NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + 3 * NFSX_UNSIGNED); *tl++ = clientid.lval[0]; /* ClientID */ *tl++ = clientid.lval[1]; *tl++ = txdr_unsigned(confirm.lval[0]); /* SequenceID */ *tl++ = txdr_unsigned(v41flags); /* Exch flags */ *tl++ = txdr_unsigned(NFSV4EXCH_SP4NONE); /* No SSV */ txdr_hyper(nfsrv_owner_minor, tl); /* Owner Minor */ if (nfsrv_owner_major[0] != 0) s = nfsrv_owner_major; else s = nd->nd_cred->cr_prison->pr_hostuuid; nfsm_strtom(nd, s, strlen(s)); /* Owner Major */ if (nfsrv_scope[0] != 0) s = nfsrv_scope; else s = nd->nd_cred->cr_prison->pr_hostuuid; nfsm_strtom(nd, s, strlen(s) ); /* Scope */ NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(1); (void)nfsm_strtom(nd, "freebsd.org", strlen("freebsd.org")); (void)nfsm_strtom(nd, version, strlen(version)); NFSM_BUILD(tl, uint32_t *, NFSX_V4TIME); verstime.tv_sec = 1293840000; /* Jan 1, 2011 */ verstime.tv_nsec = 0; txdr_nfsv4time(&verstime, tl); } NFSEXITCODE2(0, nd); return (0); nfsmout: if (clp != NULL) { free(clp->lc_req.nr_nam, M_SONAME); NFSFREEMUTEX(&clp->lc_req.nr_mtx); free(clp->lc_stateid, M_NFSDCLIENT); free(clp, M_NFSDCLIENT); } NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 create session service */ int nfsrvd_createsession(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; int error = 0; nfsquad_t clientid, confirm; struct nfsdsession *sep = NULL; uint32_t rdmacnt; struct thread *p = curthread; static bool do_printf = true; if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; sep = (struct nfsdsession *)malloc(sizeof(struct nfsdsession), M_NFSDSESSION, M_WAITOK | M_ZERO); sep->sess_refcnt = 1; mtx_init(&sep->sess_cbsess.nfsess_mtx, "nfscbsession", NULL, MTX_DEF); NFSM_DISSECT(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); clientid.lval[0] = *tl++; clientid.lval[1] = *tl++; confirm.lval[0] = fxdr_unsigned(uint32_t, *tl++); sep->sess_crflags = fxdr_unsigned(uint32_t, *tl); /* Persistent sessions and RDMA are not supported. */ sep->sess_crflags &= NFSV4CRSESS_CONNBACKCHAN; /* Fore channel attributes. */ NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED); tl++; /* Header pad always 0. */ sep->sess_maxreq = fxdr_unsigned(uint32_t, *tl++); if (sep->sess_maxreq > sb_max_adj - NFS_MAXXDR) { sep->sess_maxreq = sb_max_adj - NFS_MAXXDR; if (do_printf) printf("Consider increasing kern.ipc.maxsockbuf\n"); do_printf = false; } sep->sess_maxresp = fxdr_unsigned(uint32_t, *tl++); if (sep->sess_maxresp > sb_max_adj - NFS_MAXXDR) { sep->sess_maxresp = sb_max_adj - NFS_MAXXDR; if (do_printf) printf("Consider increasing kern.ipc.maxsockbuf\n"); do_printf = false; } sep->sess_maxrespcached = fxdr_unsigned(uint32_t, *tl++); sep->sess_maxops = fxdr_unsigned(uint32_t, *tl++); sep->sess_maxslots = fxdr_unsigned(uint32_t, *tl++); if (sep->sess_maxslots > NFSV4_SLOTS) sep->sess_maxslots = NFSV4_SLOTS; rdmacnt = fxdr_unsigned(uint32_t, *tl); if (rdmacnt > 1) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } else if (rdmacnt == 1) NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); /* Back channel attributes. */ NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED); tl++; /* Header pad always 0. */ sep->sess_cbmaxreq = fxdr_unsigned(uint32_t, *tl++); sep->sess_cbmaxresp = fxdr_unsigned(uint32_t, *tl++); sep->sess_cbmaxrespcached = fxdr_unsigned(uint32_t, *tl++); sep->sess_cbmaxops = fxdr_unsigned(uint32_t, *tl++); sep->sess_cbsess.nfsess_foreslots = fxdr_unsigned(uint32_t, *tl++); rdmacnt = fxdr_unsigned(uint32_t, *tl); if (rdmacnt > 1) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } else if (rdmacnt == 1) NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); sep->sess_cbprogram = fxdr_unsigned(uint32_t, *tl); /* * nfsrv_getclient() searches the client list for a match and * returns the appropriate NFSERR status. */ nd->nd_repstat = nfsrv_getclient(clientid, CLOPS_CONFIRM | CLOPS_RENEW, NULL, sep, confirm, sep->sess_cbprogram, nd, p); if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID); NFSBCOPY(sep->sess_sessionid, tl, NFSX_V4SESSIONID); NFSM_BUILD(tl, uint32_t *, 18 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(confirm.lval[0]); /* sequenceid */ *tl++ = txdr_unsigned(sep->sess_crflags); /* Fore channel attributes. */ *tl++ = 0; *tl++ = txdr_unsigned(sep->sess_maxreq); *tl++ = txdr_unsigned(sep->sess_maxresp); *tl++ = txdr_unsigned(sep->sess_maxrespcached); *tl++ = txdr_unsigned(sep->sess_maxops); *tl++ = txdr_unsigned(sep->sess_maxslots); *tl++ = txdr_unsigned(1); *tl++ = txdr_unsigned(0); /* No RDMA. */ /* Back channel attributes. */ *tl++ = 0; *tl++ = txdr_unsigned(sep->sess_cbmaxreq); *tl++ = txdr_unsigned(sep->sess_cbmaxresp); *tl++ = txdr_unsigned(sep->sess_cbmaxrespcached); *tl++ = txdr_unsigned(sep->sess_cbmaxops); *tl++ = txdr_unsigned(sep->sess_cbsess.nfsess_foreslots); *tl++ = txdr_unsigned(1); *tl = txdr_unsigned(0); /* No RDMA. */ } nfsmout: if (nd->nd_repstat != 0 && sep != NULL) free(sep, M_NFSDSESSION); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 sequence service */ int nfsrvd_sequence(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; uint32_t highest_slotid, sequenceid, sflags, target_highest_slotid; int cache_this, error = 0; struct thread *p = curthread; if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID); NFSBCOPY(tl, nd->nd_sessionid, NFSX_V4SESSIONID); NFSM_DISSECT(tl, uint32_t *, 4 * NFSX_UNSIGNED); sequenceid = fxdr_unsigned(uint32_t, *tl++); nd->nd_slotid = fxdr_unsigned(uint32_t, *tl++); highest_slotid = fxdr_unsigned(uint32_t, *tl++); if (*tl == newnfs_true) cache_this = 1; else cache_this = 0; nd->nd_repstat = nfsrv_checksequence(nd, sequenceid, &highest_slotid, &target_highest_slotid, cache_this, &sflags, p); if (nd->nd_repstat != NFSERR_BADSLOT) nd->nd_flag |= ND_HASSEQUENCE; if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID); NFSBCOPY(nd->nd_sessionid, tl, NFSX_V4SESSIONID); NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(sequenceid); *tl++ = txdr_unsigned(nd->nd_slotid); *tl++ = txdr_unsigned(highest_slotid); *tl++ = txdr_unsigned(target_highest_slotid); *tl = txdr_unsigned(sflags); } nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 reclaim complete service */ int nfsrvd_reclaimcomplete(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; int error = 0, onefs; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); /* * I believe that a ReclaimComplete with rca_one_fs == TRUE is only * to be used after a file system has been transferred to a different * file server. However, RFC5661 is somewhat vague w.r.t. this and * the ESXi 6.7 client does both a ReclaimComplete with rca_one_fs * == TRUE and one with ReclaimComplete with rca_one_fs == FALSE. * Therefore, just ignore the rca_one_fs == TRUE operation and return * NFS_OK without doing anything. */ onefs = 0; if (*tl == newnfs_true) onefs = 1; nd->nd_repstat = nfsrv_checkreclaimcomplete(nd, onefs); nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 destroy clientid service */ int nfsrvd_destroyclientid(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; nfsquad_t clientid; int error = 0; struct thread *p = curthread; if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); clientid.lval[0] = *tl++; clientid.lval[1] = *tl; nd->nd_repstat = nfsrv_destroyclient(clientid, p); nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 bind connection to session service */ int nfsrvd_bindconnsess(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; uint8_t sessid[NFSX_V4SESSIONID]; int error = 0, foreaft; if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID + 2 * NFSX_UNSIGNED); NFSBCOPY(tl, sessid, NFSX_V4SESSIONID); tl += (NFSX_V4SESSIONID / NFSX_UNSIGNED); foreaft = fxdr_unsigned(int, *tl++); if (*tl == newnfs_true) { /* RDMA is not supported. */ nd->nd_repstat = NFSERR_NOTSUPP; goto nfsmout; } nd->nd_repstat = nfsrv_bindconnsess(nd, sessid, &foreaft); if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID + 2 * NFSX_UNSIGNED); NFSBCOPY(sessid, tl, NFSX_V4SESSIONID); tl += (NFSX_V4SESSIONID / NFSX_UNSIGNED); *tl++ = txdr_unsigned(foreaft); *tl = newnfs_false; } nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 destroy session service */ int nfsrvd_destroysession(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint8_t *cp, sessid[NFSX_V4SESSIONID]; int error = 0; if ((nd->nd_repstat = nfsd_checkrootexp(nd)) != 0) goto nfsmout; NFSM_DISSECT(cp, uint8_t *, NFSX_V4SESSIONID); NFSBCOPY(cp, sessid, NFSX_V4SESSIONID); nd->nd_repstat = nfsrv_destroysession(nd, sessid); nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 free stateid service */ int nfsrvd_freestateid(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; nfsv4stateid_t stateid; int error = 0; struct thread *p = curthread; NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID); stateid.seqid = fxdr_unsigned(uint32_t, *tl++); NFSBCOPY(tl, stateid.other, NFSX_STATEIDOTHER); /* * For the special stateid of other all 0s and seqid == 1, set the * stateid to the current stateid, if it is set. */ if (stateid.seqid == 1 && stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stateid = nd->nd_curstateid; stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } nd->nd_repstat = nfsrv_freestateid(nd, &stateid, p); /* If the current stateid has been free'd, unset it. */ if (nd->nd_repstat == 0 && (nd->nd_flag & ND_CURSTATEID) != 0 && stateid.other[0] == nd->nd_curstateid.other[0] && stateid.other[1] == nd->nd_curstateid.other[1] && stateid.other[2] == nd->nd_curstateid.other[2]) nd->nd_flag &= ~ND_CURSTATEID; nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 layoutget service */ int nfsrvd_layoutget(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { uint32_t *tl; nfsv4stateid_t stateid; int error = 0, layoutlen, layouttype, iomode, maxcnt, retonclose; uint64_t offset, len, minlen; char *layp; struct thread *p = curthread; NFSM_DISSECT(tl, uint32_t *, 4 * NFSX_UNSIGNED + 3 * NFSX_HYPER + NFSX_STATEID); tl++; /* Signal layout available. Ignore for now. */ layouttype = fxdr_unsigned(int, *tl++); iomode = fxdr_unsigned(int, *tl++); offset = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); tl += 2; minlen = fxdr_hyper(tl); tl += 2; stateid.seqid = fxdr_unsigned(uint32_t, *tl++); NFSBCOPY(tl, stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); maxcnt = fxdr_unsigned(int, *tl); NFSD_DEBUG(4, "layoutget ltyp=%d iom=%d off=%ju len=%ju mlen=%ju\n", layouttype, iomode, (uintmax_t)offset, (uintmax_t)len, (uintmax_t)minlen); if (len < minlen || (minlen != UINT64_MAX && offset + minlen < offset) || (len != UINT64_MAX && offset + len < offset)) { nd->nd_repstat = NFSERR_INVAL; goto nfsmout; } /* * For the special stateid of other all 0s and seqid == 1, set the * stateid to the current stateid, if it is set. */ if (stateid.seqid == 1 && stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stateid = nd->nd_curstateid; stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } layp = NULL; if (layouttype == NFSLAYOUT_NFSV4_1_FILES && nfsrv_maxpnfsmirror == 1) layp = malloc(NFSX_V4FILELAYOUT, M_TEMP, M_WAITOK); else if (layouttype == NFSLAYOUT_FLEXFILE) layp = malloc(NFSX_V4FLEXLAYOUT(nfsrv_maxpnfsmirror), M_TEMP, M_WAITOK); else nd->nd_repstat = NFSERR_UNKNLAYOUTTYPE; if (layp != NULL) nd->nd_repstat = nfsrv_layoutget(nd, vp, exp, layouttype, &iomode, &offset, &len, minlen, &stateid, maxcnt, &retonclose, &layoutlen, layp, nd->nd_cred, p); NFSD_DEBUG(4, "nfsrv_layoutget stat=%u layoutlen=%d\n", nd->nd_repstat, layoutlen); if (nd->nd_repstat == 0) { /* For NFSv4.1, set the Current StateID. */ if ((nd->nd_flag & ND_NFSV41) != 0) { nd->nd_curstateid = stateid; nd->nd_flag |= ND_CURSTATEID; } NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED + NFSX_STATEID + 2 * NFSX_HYPER); *tl++ = txdr_unsigned(retonclose); *tl++ = txdr_unsigned(stateid.seqid); NFSBCOPY(stateid.other, tl, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); *tl++ = txdr_unsigned(1); /* Only returns one layout. */ txdr_hyper(offset, tl); tl += 2; txdr_hyper(len, tl); tl += 2; *tl++ = txdr_unsigned(iomode); *tl = txdr_unsigned(layouttype); nfsm_strtom(nd, layp, layoutlen); } else if (nd->nd_repstat == NFSERR_LAYOUTTRYLATER) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = newnfs_false; } free(layp, M_TEMP); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 layoutcommit service */ int nfsrvd_layoutcommit(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { uint32_t *tl; nfsv4stateid_t stateid; int error = 0, hasnewoff, hasnewmtime, layouttype, maxcnt, reclaim; int hasnewsize; uint64_t offset, len, newoff = 0, newsize; struct timespec newmtime; char *layp; struct thread *p = curthread; layp = NULL; NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + 2 * NFSX_HYPER + NFSX_STATEID); offset = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); tl += 2; reclaim = fxdr_unsigned(int, *tl++); stateid.seqid = fxdr_unsigned(uint32_t, *tl++); NFSBCOPY(tl, stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); /* * For the special stateid of other all 0s and seqid == 1, set the * stateid to the current stateid, if it is set. */ if (stateid.seqid == 1 && stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stateid = nd->nd_curstateid; stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } hasnewoff = fxdr_unsigned(int, *tl); if (hasnewoff != 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); newoff = fxdr_hyper(tl); tl += 2; } else NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); hasnewmtime = fxdr_unsigned(int, *tl); if (hasnewmtime != 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_V4TIME + 2 * NFSX_UNSIGNED); fxdr_nfsv4time(tl, &newmtime); tl += (NFSX_V4TIME / NFSX_UNSIGNED); } else NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); layouttype = fxdr_unsigned(int, *tl++); maxcnt = fxdr_unsigned(int, *tl); if (maxcnt > 0) { layp = malloc(maxcnt + 1, M_TEMP, M_WAITOK); error = nfsrv_mtostr(nd, layp, maxcnt); if (error != 0) goto nfsmout; } nd->nd_repstat = nfsrv_layoutcommit(nd, vp, layouttype, hasnewoff, newoff, offset, len, hasnewmtime, &newmtime, reclaim, &stateid, maxcnt, layp, &hasnewsize, &newsize, nd->nd_cred, p); NFSD_DEBUG(4, "nfsrv_layoutcommit stat=%u\n", nd->nd_repstat); if (nd->nd_repstat == 0) { if (hasnewsize != 0) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED + NFSX_HYPER); *tl++ = newnfs_true; txdr_hyper(newsize, tl); } else { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = newnfs_false; } } nfsmout: free(layp, M_TEMP); vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 layoutreturn service */ int nfsrvd_layoutreturn(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { uint32_t *tl, *layp; nfsv4stateid_t stateid; int error = 0, fnd, kind, layouttype, iomode, maxcnt, reclaim; uint64_t offset, len; struct thread *p = curthread; layp = NULL; NFSM_DISSECT(tl, uint32_t *, 4 * NFSX_UNSIGNED); reclaim = *tl++; layouttype = fxdr_unsigned(int, *tl++); iomode = fxdr_unsigned(int, *tl++); kind = fxdr_unsigned(int, *tl); NFSD_DEBUG(4, "layoutreturn recl=%d ltyp=%d iom=%d kind=%d\n", reclaim, layouttype, iomode, kind); if (kind == NFSV4LAYOUTRET_FILE) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_STATEID + NFSX_UNSIGNED); offset = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); tl += 2; stateid.seqid = fxdr_unsigned(uint32_t, *tl++); NFSBCOPY(tl, stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); /* * For the special stateid of other all 0s and seqid == 1, set * the stateid to the current stateid, if it is set. */ if (stateid.seqid == 1 && stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stateid = nd->nd_curstateid; stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } maxcnt = fxdr_unsigned(int, *tl); /* * There is no fixed upper bound defined in the RFCs, * but 128Kbytes should be more than sufficient. */ if (maxcnt < 0 || maxcnt > 131072) maxcnt = 0; if (maxcnt > 0) { layp = malloc(maxcnt + 1, M_TEMP, M_WAITOK); error = nfsrv_mtostr(nd, (char *)layp, maxcnt); if (error != 0) goto nfsmout; } } else { if (reclaim == newnfs_true) { nd->nd_repstat = NFSERR_INVAL; goto nfsmout; } offset = len = 0; maxcnt = 0; } nd->nd_repstat = nfsrv_layoutreturn(nd, vp, layouttype, iomode, offset, len, reclaim, kind, &stateid, maxcnt, layp, &fnd, nd->nd_cred, p); NFSD_DEBUG(4, "nfsrv_layoutreturn stat=%u fnd=%d\n", nd->nd_repstat, fnd); if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); if (fnd != 0) { *tl = newnfs_true; NFSM_BUILD(tl, uint32_t *, NFSX_STATEID); *tl++ = txdr_unsigned(stateid.seqid); NFSBCOPY(stateid.other, tl, NFSX_STATEIDOTHER); } else *tl = newnfs_false; } nfsmout: free(layp, M_TEMP); vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 layout error service */ int nfsrvd_layouterror(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { uint32_t *tl; nfsv4stateid_t stateid; int cnt, error = 0, i, stat; int opnum __unused; char devid[NFSX_V4DEVICEID]; uint64_t offset, len; NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_STATEID + NFSX_UNSIGNED); offset = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); tl += 2; stateid.seqid = fxdr_unsigned(uint32_t, *tl++); NFSBCOPY(tl, stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); cnt = fxdr_unsigned(int, *tl); NFSD_DEBUG(4, "layouterror off=%ju len=%ju cnt=%d\n", (uintmax_t)offset, (uintmax_t)len, cnt); /* * For the special stateid of other all 0s and seqid == 1, set * the stateid to the current stateid, if it is set. */ if (stateid.seqid == 1 && stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stateid = nd->nd_curstateid; stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } /* * Ignore offset, len and stateid for now. */ for (i = 0; i < cnt; i++) { NFSM_DISSECT(tl, uint32_t *, NFSX_V4DEVICEID + 2 * NFSX_UNSIGNED); NFSBCOPY(tl, devid, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); stat = fxdr_unsigned(int, *tl++); opnum = fxdr_unsigned(int, *tl); NFSD_DEBUG(4, "nfsrvd_layouterr op=%d stat=%d\n", opnum, stat); /* * Except for NFSERR_ACCES, NFSERR_STALE and NFSERR_NOSPC * errors, disable the mirror. */ if (stat != NFSERR_ACCES && stat != NFSERR_STALE && stat != NFSERR_NOSPC) nfsrv_delds(devid, curthread); /* For NFSERR_NOSPC, mark all deviceids and layouts. */ if (stat == NFSERR_NOSPC) nfsrv_marknospc(devid, true); } nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 layout stats service */ int nfsrvd_layoutstats(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { uint32_t *tl; nfsv4stateid_t stateid; int cnt, error = 0; int layouttype __unused; char devid[NFSX_V4DEVICEID] __unused; uint64_t offset __unused, len __unused, readcount __unused; uint64_t readbytes __unused, writecount __unused, writebytes __unused; NFSM_DISSECT(tl, uint32_t *, 6 * NFSX_HYPER + NFSX_STATEID + NFSX_V4DEVICEID + 2 * NFSX_UNSIGNED); offset = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); tl += 2; stateid.seqid = fxdr_unsigned(uint32_t, *tl++); NFSBCOPY(tl, stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); readcount = fxdr_hyper(tl); tl += 2; readbytes = fxdr_hyper(tl); tl += 2; writecount = fxdr_hyper(tl); tl += 2; writebytes = fxdr_hyper(tl); tl += 2; NFSBCOPY(tl, devid, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); layouttype = fxdr_unsigned(int, *tl++); cnt = fxdr_unsigned(int, *tl); error = nfsm_advance(nd, NFSM_RNDUP(cnt), -1); if (error != 0) goto nfsmout; NFSD_DEBUG(4, "layoutstats cnt=%d\n", cnt); /* * For the special stateid of other all 0s and seqid == 1, set * the stateid to the current stateid, if it is set. */ if (stateid.seqid == 1 && stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stateid = nd->nd_curstateid; stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } /* * No use for the stats for now. */ nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 io_advise service */ int nfsrvd_ioadvise(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { uint32_t *tl; nfsv4stateid_t stateid; nfsattrbit_t hints; int error = 0, ret; off_t offset, len; NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID + 2 * NFSX_HYPER); stateid.seqid = fxdr_unsigned(uint32_t, *tl++); NFSBCOPY(tl, stateid.other, NFSX_STATEIDOTHER); tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED); offset = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); error = nfsrv_getattrbits(nd, &hints, NULL, NULL); if (error != 0) goto nfsmout; /* * For the special stateid of other all 0s and seqid == 1, set * the stateid to the current stateid, if it is set. */ if (stateid.seqid == 1 && stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { if ((nd->nd_flag & ND_CURSTATEID) != 0) { stateid = nd->nd_curstateid; stateid.seqid = 0; } else { nd->nd_repstat = NFSERR_BADSTATEID; goto nfsmout; } } if (offset < 0) { nd->nd_repstat = NFSERR_INVAL; goto nfsmout; } if (len < 0) len = 0; if (vp->v_type != VREG) { if (vp->v_type == VDIR) nd->nd_repstat = NFSERR_ISDIR; else nd->nd_repstat = NFSERR_WRONGTYPE; goto nfsmout; } /* * For now, we can only handle WILLNEED and DONTNEED and don't use * the stateid. */ if ((NFSISSET_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED) && !NFSISSET_ATTRBIT(&hints, NFSV4IOHINT_DONTNEED)) || (NFSISSET_ATTRBIT(&hints, NFSV4IOHINT_DONTNEED) && !NFSISSET_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED))) { NFSVOPUNLOCK(vp); if (NFSISSET_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED)) { ret = VOP_ADVISE(vp, offset, len, POSIX_FADV_WILLNEED); NFSZERO_ATTRBIT(&hints); if (ret == 0) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED); else NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_NORMAL); } else { ret = VOP_ADVISE(vp, offset, len, POSIX_FADV_DONTNEED); NFSZERO_ATTRBIT(&hints); if (ret == 0) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_DONTNEED); else NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_NORMAL); } vrele(vp); } else { NFSZERO_ATTRBIT(&hints); NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_NORMAL); vput(vp); } nfsrv_putattrbit(nd, &hints); NFSEXITCODE2(error, nd); return (error); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 getdeviceinfo service */ int nfsrvd_getdevinfo(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl, maxcnt, notify[NFSV4_NOTIFYBITMAP]; int cnt, devaddrlen, error = 0, i, layouttype; char devid[NFSX_V4DEVICEID], *devaddr; time_t dev_time; NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED + NFSX_V4DEVICEID); NFSBCOPY(tl, devid, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); layouttype = fxdr_unsigned(int, *tl++); maxcnt = fxdr_unsigned(uint32_t, *tl++); cnt = fxdr_unsigned(int, *tl); NFSD_DEBUG(4, "getdevinfo ltyp=%d maxcnt=%u bitcnt=%d\n", layouttype, maxcnt, cnt); if (cnt > NFSV4_NOTIFYBITMAP || cnt < 0) { nd->nd_repstat = NFSERR_INVAL; goto nfsmout; } if (cnt > 0) { NFSM_DISSECT(tl, uint32_t *, cnt * NFSX_UNSIGNED); for (i = 0; i < cnt; i++) notify[i] = fxdr_unsigned(uint32_t, *tl++); } for (i = cnt; i < NFSV4_NOTIFYBITMAP; i++) notify[i] = 0; /* * Check that the device id is not stale. Device ids are recreated * each time the nfsd threads are restarted. */ NFSBCOPY(devid, &dev_time, sizeof(dev_time)); if (dev_time != nfsdev_time) { nd->nd_repstat = NFSERR_NOENT; goto nfsmout; } /* Look for the device id. */ nd->nd_repstat = nfsrv_getdevinfo(devid, layouttype, &maxcnt, notify, &devaddrlen, &devaddr); NFSD_DEBUG(4, "nfsrv_getdevinfo stat=%u\n", nd->nd_repstat); if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(layouttype); nfsm_strtom(nd, devaddr, devaddrlen); cnt = 0; for (i = 0; i < NFSV4_NOTIFYBITMAP; i++) { if (notify[i] != 0) cnt = i + 1; } NFSM_BUILD(tl, uint32_t *, (cnt + 1) * NFSX_UNSIGNED); *tl++ = txdr_unsigned(cnt); for (i = 0; i < cnt; i++) *tl++ = txdr_unsigned(notify[i]); } else if (nd->nd_repstat == NFSERR_TOOSMALL) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(maxcnt); } nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * nfsv4 test stateid service */ int nfsrvd_teststateid(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; nfsv4stateid_t *stateidp = NULL, *tstateidp; int cnt, error = 0, i, ret; struct thread *p = curthread; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); cnt = fxdr_unsigned(int, *tl); if (cnt <= 0 || cnt > 1024) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } stateidp = mallocarray(cnt, sizeof(nfsv4stateid_t), M_TEMP, M_WAITOK); tstateidp = stateidp; for (i = 0; i < cnt; i++) { NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID); tstateidp->seqid = fxdr_unsigned(uint32_t, *tl++); NFSBCOPY(tl, tstateidp->other, NFSX_STATEIDOTHER); tstateidp++; } NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(cnt); tstateidp = stateidp; for (i = 0; i < cnt; i++) { ret = nfsrv_teststateid(nd, tstateidp, p); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(ret); tstateidp++; } nfsmout: free(stateidp, M_TEMP); NFSEXITCODE2(error, nd); return (error); } /* * nfs allocate service */ int nfsrvd_allocate(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { uint32_t *tl; struct nfsvattr forat; int error = 0, forat_ret = 1, gotproxystateid; off_t off, len; struct nfsstate st, *stp = &st; struct nfslock lo, *lop = &lo; nfsv4stateid_t stateid; nfsquad_t clientid; nfsattrbit_t attrbits; if (!nfsrv_doallocate) { /* * If any exported file system, such as a ZFS one, cannot * do VOP_ALLOCATE(), this operation cannot be supported * for NFSv4.2. This cannot be done 'per filesystem', but * must be for the entire nfsd NFSv4.2 service. */ nd->nd_repstat = NFSERR_NOTSUPP; goto nfsmout; } gotproxystateid = 0; NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID + 2 * NFSX_HYPER); stp->ls_flags = (NFSLCK_CHECK | NFSLCK_WRITEACCESS); lop->lo_flags = NFSLCK_WRITE; stp->ls_ownerlen = 0; stp->ls_op = NULL; stp->ls_uid = nd->nd_cred->cr_uid; stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); clientid.lval[0] = stp->ls_stateid.other[0] = *tl++; clientid.lval[1] = stp->ls_stateid.other[1] = *tl++; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK2 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } stp->ls_stateid.other[2] = *tl++; /* * Don't allow this to be done for a DS. */ if ((nd->nd_flag & ND_DSSERVER) != 0) nd->nd_repstat = NFSERR_NOTSUPP; /* However, allow the proxy stateid. */ if (stp->ls_stateid.seqid == 0xffffffff && stp->ls_stateid.other[0] == 0x55555555 && stp->ls_stateid.other[1] == 0x55555555 && stp->ls_stateid.other[2] == 0x55555555) gotproxystateid = 1; off = fxdr_hyper(tl); tl += 2; lop->lo_first = off; len = fxdr_hyper(tl); lop->lo_end = lop->lo_first + len; /* * Sanity check the offset and length. * off and len are off_t (signed int64_t) whereas * lo_first and lo_end are uint64_t and, as such, * if off >= 0 && len > 0, lo_end cannot overflow * unless off_t is changed to something other than * int64_t. Check lo_end < lo_first in case that * is someday the case. */ if (nd->nd_repstat == 0 && (len <= 0 || off < 0 || lop->lo_end > OFF_MAX || lop->lo_end < lop->lo_first)) nd->nd_repstat = NFSERR_INVAL; if (nd->nd_repstat == 0 && vp->v_type != VREG) nd->nd_repstat = NFSERR_WRONGTYPE; NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNER); forat_ret = nfsvno_getattr(vp, &forat, nd, curthread, 1, &attrbits); if (nd->nd_repstat == 0) nd->nd_repstat = forat_ret; if (nd->nd_repstat == 0 && (forat.na_uid != nd->nd_cred->cr_uid || NFSVNO_EXSTRICTACCESS(exp))) nd->nd_repstat = nfsvno_accchk(vp, VWRITE, nd->nd_cred, exp, curthread, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat == 0 && gotproxystateid == 0) nd->nd_repstat = nfsrv_lockctrl(vp, &stp, &lop, NULL, clientid, &stateid, exp, nd, curthread); NFSD_DEBUG(4, "nfsrvd_allocate: off=%jd len=%jd stat=%d\n", (intmax_t)off, (intmax_t)len, nd->nd_repstat); if (nd->nd_repstat == 0) nd->nd_repstat = nfsvno_allocate(vp, off, len, nd->nd_cred, curthread); NFSD_DEBUG(4, "nfsrvd_allocate: aft nfsvno_allocate=%d\n", nd->nd_repstat); vput(vp); NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfs deallocate service */ int nfsrvd_deallocate(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { uint32_t *tl; struct nfsvattr forat; int error = 0, forat_ret = 1, gotproxystateid; off_t off, len; struct nfsstate st, *stp = &st; struct nfslock lo, *lop = &lo; nfsv4stateid_t stateid; nfsquad_t clientid; nfsattrbit_t attrbits; gotproxystateid = 0; NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID + 2 * NFSX_HYPER); stp->ls_flags = (NFSLCK_CHECK | NFSLCK_WRITEACCESS); lop->lo_flags = NFSLCK_WRITE; stp->ls_ownerlen = 0; stp->ls_op = NULL; stp->ls_uid = nd->nd_cred->cr_uid; stp->ls_stateid.seqid = fxdr_unsigned(u_int32_t, *tl++); clientid.lval[0] = stp->ls_stateid.other[0] = *tl++; clientid.lval[1] = stp->ls_stateid.other[1] = *tl++; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) { if ((nd->nd_flag & ND_NFSV41) != 0) clientid.qval = nd->nd_clientid.qval; else if (nd->nd_clientid.qval != clientid.qval) printf("EEK2 multiple clids\n"); } else { if ((nd->nd_flag & ND_NFSV41) != 0) printf("EEK! no clientid from session\n"); nd->nd_flag |= ND_IMPLIEDCLID; nd->nd_clientid.qval = clientid.qval; } stp->ls_stateid.other[2] = *tl++; /* * Don't allow this to be done for a DS. */ if ((nd->nd_flag & ND_DSSERVER) != 0) nd->nd_repstat = NFSERR_NOTSUPP; /* However, allow the proxy stateid. */ if (stp->ls_stateid.seqid == 0xffffffff && stp->ls_stateid.other[0] == 0x55555555 && stp->ls_stateid.other[1] == 0x55555555 && stp->ls_stateid.other[2] == 0x55555555) gotproxystateid = 1; off = fxdr_hyper(tl); tl += 2; lop->lo_first = off; len = fxdr_hyper(tl); if (len < 0) len = OFF_MAX; NFSD_DEBUG(4, "dealloc: off=%jd len=%jd\n", (intmax_t)off, (intmax_t)len); lop->lo_end = lop->lo_first + len; /* * Sanity check the offset and length. * off and len are off_t (signed int64_t) whereas * lo_first and lo_end are uint64_t and, as such, * if off >= 0 && len > 0, lo_end cannot overflow * unless off_t is changed to something other than * int64_t. Check lo_end < lo_first in case that * is someday the case. * The error to return is not specified by RFC 7862 so I * made this compatible with the Linux knfsd. */ if (nd->nd_repstat == 0) { if (off < 0 || lop->lo_end > NFSRV_MAXFILESIZE) nd->nd_repstat = NFSERR_FBIG; else if (len == 0 || lop->lo_end < lop->lo_first) nd->nd_repstat = NFSERR_INVAL; } if (nd->nd_repstat == 0 && vp->v_type != VREG) nd->nd_repstat = NFSERR_WRONGTYPE; NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNER); forat_ret = nfsvno_getattr(vp, &forat, nd, curthread, 1, &attrbits); if (nd->nd_repstat == 0) nd->nd_repstat = forat_ret; if (nd->nd_repstat == 0 && (forat.na_uid != nd->nd_cred->cr_uid || NFSVNO_EXSTRICTACCESS(exp))) nd->nd_repstat = nfsvno_accchk(vp, VWRITE, nd->nd_cred, exp, curthread, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat == 0 && gotproxystateid == 0) nd->nd_repstat = nfsrv_lockctrl(vp, &stp, &lop, NULL, clientid, &stateid, exp, nd, curthread); if (nd->nd_repstat == 0) nd->nd_repstat = nfsvno_deallocate(vp, off, len, nd->nd_cred, curthread); vput(vp); NFSD_DEBUG(4, "eo deallocate=%d\n", nd->nd_repstat); NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfs copy service */ int nfsrvd_copy_file_range(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, vnode_t tovp, struct nfsexstuff *exp, struct nfsexstuff *toexp) { uint32_t *tl; struct nfsvattr at; int cnt, error = 0, ret; off_t inoff, outoff; uint64_t len; size_t xfer; struct nfsstate inst, outst, *instp = &inst, *outstp = &outst; struct nfslock inlo, outlo, *inlop = &inlo, *outlop = &outlo; nfsquad_t clientid; nfsv4stateid_t stateid; nfsattrbit_t attrbits; void *rl_rcookie, *rl_wcookie; rl_rcookie = rl_wcookie = NULL; if (nfsrv_devidcnt > 0) { /* * For a pNFS server, reply NFSERR_NOTSUPP so that the client * will do the copy via I/O on the DS(s). */ nd->nd_repstat = NFSERR_NOTSUPP; goto nfsmout; } if (vp == tovp) { /* Copying a byte range within the same file is not allowed. */ nd->nd_repstat = NFSERR_INVAL; goto nfsmout; } NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_STATEID + 3 * NFSX_HYPER + 3 * NFSX_UNSIGNED); instp->ls_flags = (NFSLCK_CHECK | NFSLCK_READACCESS); inlop->lo_flags = NFSLCK_READ; instp->ls_ownerlen = 0; instp->ls_op = NULL; instp->ls_uid = nd->nd_cred->cr_uid; instp->ls_stateid.seqid = fxdr_unsigned(uint32_t, *tl++); clientid.lval[0] = instp->ls_stateid.other[0] = *tl++; clientid.lval[1] = instp->ls_stateid.other[1] = *tl++; if ((nd->nd_flag & ND_IMPLIEDCLID) != 0) clientid.qval = nd->nd_clientid.qval; instp->ls_stateid.other[2] = *tl++; outstp->ls_flags = (NFSLCK_CHECK | NFSLCK_WRITEACCESS); outlop->lo_flags = NFSLCK_WRITE; outstp->ls_ownerlen = 0; outstp->ls_op = NULL; outstp->ls_uid = nd->nd_cred->cr_uid; outstp->ls_stateid.seqid = fxdr_unsigned(uint32_t, *tl++); outstp->ls_stateid.other[0] = *tl++; outstp->ls_stateid.other[1] = *tl++; outstp->ls_stateid.other[2] = *tl++; inoff = fxdr_hyper(tl); tl += 2; inlop->lo_first = inoff; outoff = fxdr_hyper(tl); tl += 2; outlop->lo_first = outoff; len = fxdr_hyper(tl); tl += 2; if (len == 0) { /* len == 0 means to EOF. */ inlop->lo_end = OFF_MAX; outlop->lo_end = OFF_MAX; } else { inlop->lo_end = inlop->lo_first + len; outlop->lo_end = outlop->lo_first + len; } /* * At this time only consecutive, synchronous copy is supported, * so ca_consecutive and ca_synchronous can be ignored. */ tl += 2; cnt = fxdr_unsigned(int, *tl); if ((nd->nd_flag & ND_DSSERVER) != 0 || cnt != 0) nd->nd_repstat = NFSERR_NOTSUPP; if (nd->nd_repstat == 0 && (inoff > OFF_MAX || outoff > OFF_MAX || inlop->lo_end > OFF_MAX || outlop->lo_end > OFF_MAX || inlop->lo_end < inlop->lo_first || outlop->lo_end < outlop->lo_first)) nd->nd_repstat = NFSERR_INVAL; if (nd->nd_repstat == 0 && vp->v_type != VREG) nd->nd_repstat = NFSERR_WRONGTYPE; /* Check permissions for the input file. */ NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNER); ret = nfsvno_getattr(vp, &at, nd, curthread, 1, &attrbits); if (nd->nd_repstat == 0) nd->nd_repstat = ret; if (nd->nd_repstat == 0 && (at.na_uid != nd->nd_cred->cr_uid || NFSVNO_EXSTRICTACCESS(exp))) nd->nd_repstat = nfsvno_accchk(vp, VREAD, nd->nd_cred, exp, curthread, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat == 0) nd->nd_repstat = nfsrv_lockctrl(vp, &instp, &inlop, NULL, clientid, &stateid, exp, nd, curthread); NFSVOPUNLOCK(vp); if (nd->nd_repstat != 0) goto out; error = NFSVOPLOCK(tovp, LK_SHARED); if (error != 0) goto out; if (tovp->v_type != VREG) nd->nd_repstat = NFSERR_WRONGTYPE; /* For the output file, we only need the Owner attribute. */ ret = nfsvno_getattr(tovp, &at, nd, curthread, 1, &attrbits); if (nd->nd_repstat == 0) nd->nd_repstat = ret; if (nd->nd_repstat == 0 && (at.na_uid != nd->nd_cred->cr_uid || NFSVNO_EXSTRICTACCESS(exp))) nd->nd_repstat = nfsvno_accchk(tovp, VWRITE, nd->nd_cred, toexp, curthread, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat == 0) nd->nd_repstat = nfsrv_lockctrl(tovp, &outstp, &outlop, NULL, clientid, &stateid, toexp, nd, curthread); NFSVOPUNLOCK(tovp); /* Range lock the byte ranges for both invp and outvp. */ if (nd->nd_repstat == 0) { for (;;) { if (len == 0) { rl_wcookie = vn_rangelock_wlock(tovp, outoff, OFF_MAX); rl_rcookie = vn_rangelock_tryrlock(vp, inoff, OFF_MAX); } else { rl_wcookie = vn_rangelock_wlock(tovp, outoff, outoff + len); rl_rcookie = vn_rangelock_tryrlock(vp, inoff, inoff + len); } if (rl_rcookie != NULL) break; vn_rangelock_unlock(tovp, rl_wcookie); if (len == 0) rl_rcookie = vn_rangelock_rlock(vp, inoff, OFF_MAX); else rl_rcookie = vn_rangelock_rlock(vp, inoff, inoff + len); vn_rangelock_unlock(vp, rl_rcookie); } error = NFSVOPLOCK(vp, LK_SHARED); if (error == 0) { ret = nfsvno_getattr(vp, &at, nd, curthread, 1, NULL); if (ret == 0) { /* * Since invp is range locked, na_size should * not change. */ if (len == 0 && at.na_size > inoff) { /* * If len == 0, set it based on invp's * size. If offset is past EOF, just * leave len == 0. */ len = at.na_size - inoff; } else if (nfsrv_linux42server == 0 && inoff + len > at.na_size) { /* * RFC-7862 says that NFSERR_INVAL must * be returned when inoff + len exceeds * the file size, however the NFSv4.2 * Linux client likes to do this, so * only check if nfsrv_linux42server * is not set. */ nd->nd_repstat = NFSERR_INVAL; } } NFSVOPUNLOCK(vp); if (ret != 0 && nd->nd_repstat == 0) nd->nd_repstat = ret; } else if (nd->nd_repstat == 0) nd->nd_repstat = error; } xfer = len; if (nd->nd_repstat == 0) { nd->nd_repstat = vn_copy_file_range(vp, &inoff, tovp, &outoff, &xfer, COPY_FILE_RANGE_TIMEO1SEC, nd->nd_cred, nd->nd_cred, NULL); if (nd->nd_repstat == 0) len = xfer; } /* Unlock the ranges. */ if (rl_rcookie != NULL) vn_rangelock_unlock(vp, rl_rcookie); if (rl_wcookie != NULL) vn_rangelock_unlock(tovp, rl_wcookie); if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED + NFSX_HYPER + NFSX_VERF); *tl++ = txdr_unsigned(0); /* No callback ids. */ txdr_hyper(len, tl); tl += 2; *tl++ = txdr_unsigned(NFSWRITE_UNSTABLE); *tl++ = txdr_unsigned(nfsboottime.tv_sec); *tl++ = txdr_unsigned(nfsboottime.tv_usec); *tl++ = newnfs_true; *tl = newnfs_true; } out: vrele(vp); vrele(tovp); NFSEXITCODE2(error, nd); return (error); nfsmout: vput(vp); vrele(tovp); NFSEXITCODE2(error, nd); return (error); } /* * nfs seek service */ int nfsrvd_seek(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, struct nfsexstuff *exp) { uint32_t *tl; struct nfsvattr at; int content, error = 0; off_t off; u_long cmd; nfsattrbit_t attrbits; bool eof; NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID + NFSX_HYPER + NFSX_UNSIGNED); /* Ignore the stateid for now. */ tl += (NFSX_STATEID / NFSX_UNSIGNED); off = fxdr_hyper(tl); tl += 2; content = fxdr_unsigned(int, *tl); if (content == NFSV4CONTENT_DATA) cmd = FIOSEEKDATA; else if (content == NFSV4CONTENT_HOLE) cmd = FIOSEEKHOLE; else nd->nd_repstat = NFSERR_BADXDR; if (nd->nd_repstat == 0 && vp->v_type == VDIR) nd->nd_repstat = NFSERR_ISDIR; if (nd->nd_repstat == 0 && vp->v_type != VREG) nd->nd_repstat = NFSERR_WRONGTYPE; if (nd->nd_repstat == 0 && off < 0) nd->nd_repstat = NFSERR_NXIO; if (nd->nd_repstat == 0) { /* Check permissions for the input file. */ NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNER); nd->nd_repstat = nfsvno_getattr(vp, &at, nd, curthread, 1, &attrbits); } if (nd->nd_repstat == 0 && (at.na_uid != nd->nd_cred->cr_uid || NFSVNO_EXSTRICTACCESS(exp))) nd->nd_repstat = nfsvno_accchk(vp, VREAD, nd->nd_cred, exp, curthread, NFSACCCHK_ALLOWOWNER, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat != 0) goto nfsmout; /* nfsvno_seek() unlocks and vrele()s the vp. */ nd->nd_repstat = nfsvno_seek(nd, vp, cmd, &off, content, &eof, nd->nd_cred, curthread); if (nd->nd_repstat == 0 && eof && content == NFSV4CONTENT_DATA && nfsrv_linux42server != 0) nd->nd_repstat = NFSERR_NXIO; if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED + NFSX_HYPER); if (eof) *tl++ = newnfs_true; else *tl++ = newnfs_false; txdr_hyper(off, tl); } NFSEXITCODE2(error, nd); return (error); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * nfs get extended attribute service */ int nfsrvd_getxattr(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; struct mbuf *mp = NULL, *mpend = NULL; int error, len; char *name; struct thread *p = curthread; uint16_t off; error = 0; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len <= 0) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } if (len > EXTATTR_MAXNAMELEN) { nd->nd_repstat = NFSERR_NOXATTR; goto nfsmout; } name = malloc(len + 1, M_TEMP, M_WAITOK); nd->nd_repstat = nfsrv_mtostr(nd, name, len); if (nd->nd_repstat == 0) nd->nd_repstat = nfsvno_getxattr(vp, name, nd->nd_maxresp, nd->nd_cred, nd->nd_flag, nd->nd_maxextsiz, p, &mp, &mpend, &len); if (nd->nd_repstat == ENOATTR) nd->nd_repstat = NFSERR_NOXATTR; else if (nd->nd_repstat == EOPNOTSUPP) nd->nd_repstat = NFSERR_NOTSUPP; if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(len); if (len > 0) { nd->nd_mb->m_next = mp; nd->nd_mb = mpend; if ((mpend->m_flags & M_EXTPG) != 0) { nd->nd_flag |= ND_EXTPG; nd->nd_bextpg = mpend->m_epg_npgs - 1; nd->nd_bpos = (char *)(void *) PHYS_TO_DMAP(mpend->m_epg_pa[nd->nd_bextpg]); off = (nd->nd_bextpg == 0) ? mpend->m_epg_1st_off : 0; nd->nd_bpos += off + mpend->m_epg_last_len; nd->nd_bextpgsiz = PAGE_SIZE - mpend->m_epg_last_len - off; } else nd->nd_bpos = mtod(mpend, char *) + mpend->m_len; } } free(name, M_TEMP); nfsmout: if (nd->nd_repstat == 0) nd->nd_repstat = error; vput(vp); NFSEXITCODE2(0, nd); return (0); } /* * nfs set extended attribute service */ int nfsrvd_setxattr(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; struct nfsvattr ova, nva; nfsattrbit_t attrbits; int error, len, opt; char *name; size_t siz; struct thread *p = curthread; error = 0; name = NULL; NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); opt = fxdr_unsigned(int, *tl++); len = fxdr_unsigned(int, *tl); if (len <= 0) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } if (len > EXTATTR_MAXNAMELEN) { nd->nd_repstat = NFSERR_NOXATTR; goto nfsmout; } name = malloc(len + 1, M_TEMP, M_WAITOK); error = nfsrv_mtostr(nd, name, len); if (error != 0) goto nfsmout; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len < 0 || len > IOSIZE_MAX) { nd->nd_repstat = NFSERR_XATTR2BIG; goto nfsmout; } switch (opt) { case NFSV4SXATTR_CREATE: error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, NULL, &siz, nd->nd_cred, p); if (error != ENOATTR) nd->nd_repstat = NFSERR_EXIST; error = 0; break; case NFSV4SXATTR_REPLACE: error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, NULL, &siz, nd->nd_cred, p); if (error != 0) nd->nd_repstat = NFSERR_NOXATTR; break; case NFSV4SXATTR_EITHER: break; default: nd->nd_repstat = NFSERR_BADXDR; } if (nd->nd_repstat != 0) goto nfsmout; /* Now, do the Set Extended attribute, with Change before and after. */ NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); nd->nd_repstat = nfsvno_getattr(vp, &ova, nd, p, 1, &attrbits); if (nd->nd_repstat == 0) { nd->nd_repstat = nfsvno_setxattr(vp, name, len, nd->nd_md, nd->nd_dpos, nd->nd_cred, p); if (nd->nd_repstat == ENXIO) nd->nd_repstat = NFSERR_XATTR2BIG; } if (nd->nd_repstat == 0 && len > 0) nd->nd_repstat = nfsm_advance(nd, NFSM_RNDUP(len), -1); if (nd->nd_repstat == 0) nd->nd_repstat = nfsvno_getattr(vp, &nva, nd, p, 1, &attrbits); if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_UNSIGNED); *tl++ = newnfs_true; txdr_hyper(ova.na_filerev, tl); tl += 2; txdr_hyper(nva.na_filerev, tl); } nfsmout: free(name, M_TEMP); if (nd->nd_repstat == 0) nd->nd_repstat = error; vput(vp); NFSEXITCODE2(0, nd); return (0); } /* * nfs remove extended attribute service */ int nfsrvd_rmxattr(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t *tl; struct nfsvattr ova, nva; nfsattrbit_t attrbits; int error, len; char *name; struct thread *p = curthread; error = 0; name = NULL; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len <= 0) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } if (len > EXTATTR_MAXNAMELEN) { nd->nd_repstat = NFSERR_NOXATTR; goto nfsmout; } name = malloc(len + 1, M_TEMP, M_WAITOK); error = nfsrv_mtostr(nd, name, len); if (error != 0) goto nfsmout; if ((nd->nd_flag & ND_IMPLIEDCLID) == 0) { printf("EEK! nfsrvd_rmxattr: no implied clientid\n"); error = NFSERR_NOXATTR; goto nfsmout; } /* * Now, do the Remove Extended attribute, with Change before and * after. */ NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); nd->nd_repstat = nfsvno_getattr(vp, &ova, nd, p, 1, &attrbits); if (nd->nd_repstat == 0) { nd->nd_repstat = nfsvno_rmxattr(nd, vp, name, nd->nd_cred, p); if (nd->nd_repstat == ENOATTR) nd->nd_repstat = NFSERR_NOXATTR; } if (nd->nd_repstat == 0) nd->nd_repstat = nfsvno_getattr(vp, &nva, nd, p, 1, &attrbits); if (nd->nd_repstat == 0) { NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_UNSIGNED); *tl++ = newnfs_true; txdr_hyper(ova.na_filerev, tl); tl += 2; txdr_hyper(nva.na_filerev, tl); } nfsmout: free(name, M_TEMP); if (nd->nd_repstat == 0) nd->nd_repstat = error; vput(vp); NFSEXITCODE2(0, nd); return (0); } /* * nfs list extended attribute service */ int nfsrvd_listxattr(struct nfsrv_descript *nd, __unused int isdgram, vnode_t vp, __unused struct nfsexstuff *exp) { uint32_t cnt, *tl, len, len2, i, pos, retlen; int error; uint64_t cookie, cookie2; u_char *buf; bool eof; struct thread *p = curthread; error = 0; buf = NULL; NFSM_DISSECT(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); /* * The cookie doesn't need to be in net byte order, but FreeBSD * does so to make it more readable in packet traces. */ cookie = fxdr_hyper(tl); tl += 2; len = fxdr_unsigned(uint32_t, *tl); if (len == 0 || cookie >= IOSIZE_MAX) { nd->nd_repstat = NFSERR_BADXDR; goto nfsmout; } if (len > nd->nd_maxresp - NFS_MAXXDR) len = nd->nd_maxresp - NFS_MAXXDR; len2 = len; nd->nd_repstat = nfsvno_listxattr(vp, cookie, nd->nd_cred, p, &buf, &len, &eof); if (nd->nd_repstat == EOPNOTSUPP) nd->nd_repstat = NFSERR_NOTSUPP; if (nd->nd_repstat == 0) { cookie2 = cookie + len; if (cookie2 < cookie) nd->nd_repstat = NFSERR_BADXDR; } retlen = NFSX_HYPER + 2 * NFSX_UNSIGNED; if (nd->nd_repstat == 0 && len2 < retlen) nd->nd_repstat = NFSERR_TOOSMALL; if (nd->nd_repstat == 0) { /* Now copy the entries out. */ if (len == 0) { /* The cookie was at eof. */ NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); txdr_hyper(cookie2, tl); tl += 2; *tl++ = txdr_unsigned(0); *tl = newnfs_true; goto nfsmout; } /* Sanity check the cookie. */ for (pos = 0; pos < len; pos += (i + 1)) { if (pos == cookie) break; i = buf[pos]; } if (pos != cookie) { nd->nd_repstat = NFSERR_INVAL; goto nfsmout; } /* Loop around copying the entrie(s) out. */ cnt = 0; len -= cookie; i = buf[pos]; while (i < len && len2 >= retlen + NFSM_RNDUP(i) + NFSX_UNSIGNED) { if (cnt == 0) { NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(cookie2, tl); tl += 2; } retlen += nfsm_strtom(nd, &buf[pos + 1], i); len -= (i + 1); pos += (i + 1); i = buf[pos]; cnt++; } /* * eof is set true/false by nfsvno_listxattr(), but if we * can't copy all entries returned by nfsvno_listxattr(), * we are not at eof. */ if (len > 0) eof = false; if (cnt > 0) { /* *tl is set above. */ *tl = txdr_unsigned(cnt); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); if (eof) *tl = newnfs_true; else *tl = newnfs_false; } else nd->nd_repstat = NFSERR_TOOSMALL; } nfsmout: free(buf, M_TEMP); if (nd->nd_repstat == 0) nd->nd_repstat = error; vput(vp); NFSEXITCODE2(0, nd); return (0); } /* * nfsv4 service not supported */ int nfsrvd_notsupp(struct nfsrv_descript *nd, __unused int isdgram, __unused vnode_t vp, __unused struct nfsexstuff *exp) { nd->nd_repstat = NFSERR_NOTSUPP; NFSEXITCODE2(0, nd); return (0); } diff --git a/sys/fs/smbfs/smbfs_vnops.c b/sys/fs/smbfs/smbfs_vnops.c index 8df399b392f9..044745111543 100644 --- a/sys/fs/smbfs/smbfs_vnops.c +++ b/sys/fs/smbfs/smbfs_vnops.c @@ -1,1390 +1,1385 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Prototypes for SMBFS vnode operations */ static vop_create_t smbfs_create; static vop_mknod_t smbfs_mknod; static vop_open_t smbfs_open; static vop_close_t smbfs_close; static vop_access_t smbfs_access; static vop_getattr_t smbfs_getattr; static vop_setattr_t smbfs_setattr; static vop_read_t smbfs_read; static vop_write_t smbfs_write; static vop_fsync_t smbfs_fsync; static vop_remove_t smbfs_remove; static vop_link_t smbfs_link; static vop_lookup_t smbfs_lookup; static vop_rename_t smbfs_rename; static vop_mkdir_t smbfs_mkdir; static vop_rmdir_t smbfs_rmdir; static vop_symlink_t smbfs_symlink; static vop_readdir_t smbfs_readdir; static vop_strategy_t smbfs_strategy; static vop_print_t smbfs_print; static vop_pathconf_t smbfs_pathconf; static vop_advlock_t smbfs_advlock; static vop_getextattr_t smbfs_getextattr; struct vop_vector smbfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = smbfs_access, .vop_advlock = smbfs_advlock, .vop_close = smbfs_close, .vop_create = smbfs_create, .vop_fsync = smbfs_fsync, .vop_getattr = smbfs_getattr, .vop_getextattr = smbfs_getextattr, .vop_getpages = smbfs_getpages, .vop_inactive = smbfs_inactive, .vop_ioctl = smbfs_ioctl, .vop_link = smbfs_link, .vop_lookup = smbfs_lookup, .vop_mkdir = smbfs_mkdir, .vop_mknod = smbfs_mknod, .vop_open = smbfs_open, .vop_pathconf = smbfs_pathconf, .vop_print = smbfs_print, .vop_putpages = smbfs_putpages, .vop_read = smbfs_read, .vop_readdir = smbfs_readdir, .vop_reclaim = smbfs_reclaim, .vop_remove = smbfs_remove, .vop_rename = smbfs_rename, .vop_rmdir = smbfs_rmdir, .vop_setattr = smbfs_setattr, /* .vop_setextattr = smbfs_setextattr,*/ .vop_strategy = smbfs_strategy, .vop_symlink = smbfs_symlink, .vop_write = smbfs_write, }; VFS_VOP_VECTOR_REGISTER(smbfs_vnodeops); static int smbfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; mode_t mpmode; struct smbmount *smp = VTOSMBFS(vp); SMBVDEBUG("\n"); if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return EROFS; default: break; } } mpmode = vp->v_type == VREG ? smp->sm_file_mode : smp->sm_dir_mode; return (vaccess(vp->v_type, mpmode, smp->sm_uid, smp->sm_gid, ap->a_accmode, ap->a_cred)); } /* ARGSUSED */ static int smbfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; struct vattr vattr; int mode = ap->a_mode; int error, accmode; SMBVDEBUG("%s,%d\n", np->n_name, (np->n_flag & NOPEN) != 0); if (vp->v_type != VREG && vp->v_type != VDIR) { SMBFSERR("open eacces vtype=%d\n", vp->v_type); return EACCES; } if (vp->v_type == VDIR) { np->n_flag |= NOPEN; return 0; } if (np->n_flag & NMODIFIED) { if ((error = smbfs_vinvalbuf(vp, ap->a_td)) == EINTR) return error; smbfs_attr_cacheremove(vp); error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return error; if (np->n_mtime.tv_sec != vattr.va_mtime.tv_sec) { error = smbfs_vinvalbuf(vp, ap->a_td); if (error == EINTR) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } } if ((np->n_flag & NOPEN) != 0) return 0; /* * Use DENYNONE to give unixy semantics of permitting * everything not forbidden by permissions. Ie denial * is up to server with clients/openers needing to use * advisory locks for further control. */ accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) accmode = SMB_SM_DENYNONE|SMB_AM_OPENRW; scred = smbfs_malloc_scred(); smb_makescred(scred, ap->a_td, ap->a_cred); error = smbfs_smb_open(np, accmode, scred); if (error) { if (mode & FWRITE) return EACCES; else if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD; error = smbfs_smb_open(np, accmode, scred); } } if (error == 0) { np->n_flag |= NOPEN; vnode_create_vobject(ap->a_vp, vattr.va_size, ap->a_td); } smbfs_attr_cacheremove(vp); smbfs_free_scred(scred); return error; } static int smbfs_close(ap) struct vop_close_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; if (vp->v_type == VDIR && (np->n_flag & NOPEN) != 0 && np->n_dirseq != NULL) { scred = smbfs_malloc_scred(); smb_makescred(scred, td, ap->a_cred); smbfs_findclose(np->n_dirseq, scred); smbfs_free_scred(scred); np->n_dirseq = NULL; } return 0; } /* * smbfs_getattr call from vfs. */ static int smbfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct vattr *va=ap->a_vap; struct smbfattr fattr; struct smb_cred *scred; u_quad_t oldsize; int error; SMBVDEBUG("%lx: '%s' %d\n", (long)vp, np->n_name, (vp->v_vflag & VV_ROOT) != 0); error = smbfs_attr_cachelookup(vp, va); if (!error) return 0; SMBVDEBUG("not in the cache\n"); scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, ap->a_cred); oldsize = np->n_size; error = smbfs_smb_lookup(np, NULL, 0, &fattr, scred); if (error) { SMBVDEBUG("error %d\n", error); smbfs_free_scred(scred); return error; } smbfs_attr_cacheenter(vp, &fattr); smbfs_attr_cachelookup(vp, va); if (np->n_flag & NOPEN) np->n_size = oldsize; smbfs_free_scred(scred); return 0; } static int smbfs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct vattr *vap = ap->a_vap; struct timespec *mtime, *atime; struct smb_cred *scred; struct smb_share *ssp = np->n_mount->sm_share; struct smb_vc *vcp = SSTOVC(ssp); struct thread *td = curthread; u_quad_t tsize = 0; int isreadonly, doclose, error = 0; int old_n_dosattr; SMBVDEBUG("\n"); isreadonly = (vp->v_mount->mnt_flag & MNT_RDONLY); /* * Disallow write attempts if the filesystem is mounted read-only. */ if ((vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL || vap->va_flags != VNOVAL) && isreadonly) return EROFS; /* * We only support setting four flags. Don't allow setting others. * * We map UF_READONLY to SMB_FA_RDONLY, unlike the MacOS X version * of this code, which maps both UF_IMMUTABLE AND SF_IMMUTABLE to * SMB_FA_RDONLY. The immutable flags have different semantics * than readonly, which is the reason for the difference. */ if (vap->va_flags != VNOVAL) { if (vap->va_flags & ~(UF_HIDDEN|UF_SYSTEM|UF_ARCHIVE| UF_READONLY)) return EINVAL; } scred = smbfs_malloc_scred(); smb_makescred(scred, td, ap->a_cred); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: error = EISDIR; goto out; case VREG: break; default: error = EINVAL; goto out; } if (isreadonly) { error = EROFS; goto out; } doclose = 0; vnode_pager_setsize(vp, (u_long)vap->va_size); tsize = np->n_size; np->n_size = vap->va_size; if ((np->n_flag & NOPEN) == 0) { error = smbfs_smb_open(np, SMB_SM_DENYNONE|SMB_AM_OPENRW, scred); if (error == 0) doclose = 1; } if (error == 0) error = smbfs_smb_setfsize(np, (int64_t)vap->va_size, scred); if (doclose) smbfs_smb_close(ssp, np->n_fid, NULL, scred); if (error) { np->n_size = tsize; vnode_pager_setsize(vp, (u_long)tsize); goto out; } } if ((vap->va_flags != VNOVAL) || (vap->va_mode != (mode_t)VNOVAL)) { old_n_dosattr = np->n_dosattr; if (vap->va_mode != (mode_t)VNOVAL) { if (vap->va_mode & S_IWUSR) np->n_dosattr &= ~SMB_FA_RDONLY; else np->n_dosattr |= SMB_FA_RDONLY; } if (vap->va_flags != VNOVAL) { if (vap->va_flags & UF_HIDDEN) np->n_dosattr |= SMB_FA_HIDDEN; else np->n_dosattr &= ~SMB_FA_HIDDEN; if (vap->va_flags & UF_SYSTEM) np->n_dosattr |= SMB_FA_SYSTEM; else np->n_dosattr &= ~SMB_FA_SYSTEM; if (vap->va_flags & UF_ARCHIVE) np->n_dosattr |= SMB_FA_ARCHIVE; else np->n_dosattr &= ~SMB_FA_ARCHIVE; /* * We only support setting the immutable / readonly * bit for regular files. According to comments in * the MacOS X version of this code, supporting the * readonly bit on directories doesn't do the same * thing in Windows as in Unix. */ if (vp->v_type == VREG) { if (vap->va_flags & UF_READONLY) np->n_dosattr |= SMB_FA_RDONLY; else np->n_dosattr &= ~SMB_FA_RDONLY; } } if (np->n_dosattr != old_n_dosattr) { error = smbfs_smb_setpattr(np, np->n_dosattr, NULL, scred); if (error) goto out; } } mtime = atime = NULL; if (vap->va_mtime.tv_sec != VNOVAL) mtime = &vap->va_mtime; if (vap->va_atime.tv_sec != VNOVAL) atime = &vap->va_atime; if (mtime != atime) { if (vap->va_vaflags & VA_UTIMES_NULL) { error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td); if (error) error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td); } else error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td); #if 0 if (mtime == NULL) mtime = &np->n_mtime; if (atime == NULL) atime = &np->n_atime; #endif /* * If file is opened, then we can use handle based calls. * If not, use path based ones. */ if ((np->n_flag & NOPEN) == 0) { if (vcp->vc_flags & SMBV_WIN95) { error = VOP_OPEN(vp, FWRITE, ap->a_cred, td, NULL); if (!error) { /* error = smbfs_smb_setfattrNT(np, 0, mtime, atime, scred); VOP_GETATTR(vp, &vattr, ap->a_cred); */ if (mtime) np->n_mtime = *mtime; VOP_CLOSE(vp, FWRITE, ap->a_cred, td); } } else if ((vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS)) { error = smbfs_smb_setptime2(np, mtime, atime, 0, scred); /* error = smbfs_smb_setpattrNT(np, 0, mtime, atime, scred);*/ } else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN2_0) { error = smbfs_smb_setptime2(np, mtime, atime, 0, scred); } else { error = smbfs_smb_setpattr(np, 0, mtime, scred); } } else { if (vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS) { error = smbfs_smb_setfattrNT(np, 0, mtime, atime, scred); } else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN1_0) { error = smbfs_smb_setftime(np, mtime, atime, scred); } else { /* * I have no idea how to handle this for core * level servers. The possible solution is to * update mtime after file is closed. */ SMBERROR("can't update times on an opened file\n"); } } } /* * Invalidate attribute cache in case if server doesn't set * required attributes. */ smbfs_attr_cacheremove(vp); /* invalidate cache */ VOP_GETATTR(vp, vap, ap->a_cred); np->n_mtime.tv_sec = vap->va_mtime.tv_sec; out: smbfs_free_scred(scred); return error; } /* * smbfs_read call. */ static int smbfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; SMBVDEBUG("\n"); if (vp->v_type != VREG && vp->v_type != VDIR) return EPERM; return smbfs_readvnode(vp, uio, ap->a_cred); } static int smbfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; SMBVDEBUG("%d,ofs=%jd,sz=%zd\n",vp->v_type, (intmax_t)uio->uio_offset, uio->uio_resid); if (vp->v_type != VREG) return (EPERM); return smbfs_writevnode(vp, uio, ap->a_cred,ap->a_ioflag); } /* * smbfs_create call * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or * only if the SAVESTART bit in cn_flags is clear on success. */ static int smbfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct vnode **vpp=ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct smbnode *dnp = VTOSMB(dvp); struct vnode *vp; struct vattr vattr; struct smbfattr fattr; struct smb_cred *scred; char *name = cnp->cn_nameptr; int nmlen = cnp->cn_namelen; int error; SMBVDEBUG("\n"); *vpp = NULL; if (vap->va_type != VREG) return EOPNOTSUPP; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred))) return error; scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, cnp->cn_cred); error = smbfs_smb_create(dnp, name, nmlen, scred); if (error) goto out; error = smbfs_smb_lookup(dnp, name, nmlen, &fattr, scred); if (error) goto out; error = smbfs_nget(VTOVFS(dvp), dvp, name, nmlen, &fattr, &vp); if (error) goto out; *vpp = vp; if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, vp, cnp); out: smbfs_free_scred(scred); return error; } static int smbfs_remove(ap) struct vop_remove_args /* { struct vnodeop_desc *a_desc; struct vnode * a_dvp; struct vnode * a_vp; struct componentname * a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; /* struct vnode *dvp = ap->a_dvp;*/ struct componentname *cnp = ap->a_cnp; struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; int error; if (vp->v_type == VDIR || (np->n_flag & NOPEN) != 0 || vrefcnt(vp) != 1) return EPERM; scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, cnp->cn_cred); error = smbfs_smb_delete(np, scred); if (error == 0) np->n_flag |= NGONE; cache_purge(vp); smbfs_free_scred(scred); return error; } /* * smbfs_file rename call */ static int smbfs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *fvp = ap->a_fvp; struct vnode *tvp = ap->a_tvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tdvp = ap->a_tdvp; struct componentname *tcnp = ap->a_tcnp; /* struct componentname *fcnp = ap->a_fcnp;*/ struct smb_cred *scred; #ifdef notnow u_int16_t flags = 6; #endif int error=0; scred = NULL; /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } if (tvp && vrefcnt(tvp) > 1) { error = EBUSY; goto out; } #ifdef notnow flags = 0x10; /* verify all writes */ #endif if (fvp->v_type == VDIR) { #ifdef notnow flags |= 2; #endif } else if (fvp->v_type == VREG) { #ifdef notnow flags |= 1; #endif } else { return EINVAL; } scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, tcnp->cn_cred); /* * It seems that Samba doesn't implement SMB_COM_MOVE call... */ #ifdef notnow if (SMB_DIALECT(SSTOCN(smp->sm_share)) >= SMB_DIALECT_LANMAN1_0) { error = smbfs_smb_move(VTOSMB(fvp), VTOSMB(tdvp), tcnp->cn_nameptr, tcnp->cn_namelen, flags, scred); } else #endif { /* * We have to do the work atomicaly */ if (tvp && tvp != fvp) { error = smbfs_smb_delete(VTOSMB(tvp), scred); if (error) goto out_cacherem; VTOSMB(fvp)->n_flag |= NGONE; } error = smbfs_smb_rename(VTOSMB(fvp), VTOSMB(tdvp), tcnp->cn_nameptr, tcnp->cn_namelen, scred); } if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out_cacherem: smbfs_attr_cacheremove(fdvp); smbfs_attr_cacheremove(tdvp); out: smbfs_free_scred(scred); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); #ifdef possible_mistake vgone(fvp); if (tvp) vgone(tvp); #endif return error; } /* * somtime it will come true... */ static int smbfs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { return EOPNOTSUPP; } /* * smbfs_symlink link create call. * Sometime it will be functional... */ static int smbfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { return EOPNOTSUPP; } static int smbfs_mknod(ap) struct vop_mknod_args /* { } */ *ap; { return EOPNOTSUPP; } static int smbfs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; /* struct vattr *vap = ap->a_vap;*/ struct vnode *vp; struct componentname *cnp = ap->a_cnp; struct smbnode *dnp = VTOSMB(dvp); struct vattr vattr; struct smb_cred *scred; struct smbfattr fattr; char *name = cnp->cn_nameptr; int len = cnp->cn_namelen; int error; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred))) { return error; } if ((name[0] == '.') && ((len == 1) || ((len == 2) && (name[1] == '.')))) return EEXIST; scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, cnp->cn_cred); error = smbfs_smb_mkdir(dnp, name, len, scred); if (error) goto out; error = smbfs_smb_lookup(dnp, name, len, &fattr, scred); if (error) goto out; error = smbfs_nget(VTOVFS(dvp), dvp, name, len, &fattr, &vp); if (error) goto out; *ap->a_vpp = vp; out: smbfs_free_scred(scred); return error; } /* * smbfs_remove directory call */ static int smbfs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; /* struct smbmount *smp = VTOSMBFS(vp);*/ struct smbnode *dnp = VTOSMB(dvp); struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; int error; if (dvp == vp) return EINVAL; scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, cnp->cn_cred); error = smbfs_smb_rmdir(np, scred); if (error == 0) np->n_flag |= NGONE; dnp->n_flag |= NMODIFIED; smbfs_attr_cacheremove(dvp); /* cache_purge(dvp);*/ cache_purge(vp); smbfs_free_scred(scred); return error; } /* * smbfs_readdir call */ static int smbfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; uint64_t *a_cookies; int a_ncookies; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int error; if (vp->v_type != VDIR) return (EPERM); #ifdef notnow if (ap->a_ncookies) { printf("smbfs_readdir: no support for cookies now..."); return (EOPNOTSUPP); } #endif error = smbfs_readvnode(vp, uio, ap->a_cred); return error; } /* ARGSUSED */ static int smbfs_fsync(ap) struct vop_fsync_args /* { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct thread * a_td; } */ *ap; { /* return (smb_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_td, 1));*/ return (0); } static int smbfs_print (ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); if (np == NULL) { printf("no smbnode data\n"); return (0); } printf("\tname = %s, parent = %p, open = %d\n", np->n_name, np->n_parent ? np->n_parent : NULL, (np->n_flag & NOPEN) != 0); return (0); } static int smbfs_pathconf (ap) struct vop_pathconf_args /* { struct vnode *vp; int name; register_t *retval; } */ *ap; { struct smbmount *smp = VFSTOSMBFS(VTOVFS(ap->a_vp)); struct smb_vc *vcp = SSTOVC(smp->sm_share); long *retval = ap->a_retval; int error = 0; switch (ap->a_name) { case _PC_FILESIZEBITS: if (vcp->vc_sopt.sv_caps & (SMB_CAP_LARGE_READX | SMB_CAP_LARGE_WRITEX)) *retval = 64; else *retval = 32; break; case _PC_NAME_MAX: *retval = (vcp->vc_hflags2 & SMB_FLAGS2_KNOWS_LONG_NAMES) ? 255 : 12; break; case _PC_PATH_MAX: *retval = 800; /* XXX: a correct one ? */ break; case _PC_NO_TRUNC: *retval = 1; break; default: error = vop_stdpathconf(ap); } return error; } static int smbfs_strategy (ap) struct vop_strategy_args /* { struct buf *a_bp } */ *ap; { struct buf *bp=ap->a_bp; struct ucred *cr; struct thread *td; SMBVDEBUG("\n"); if (bp->b_flags & B_ASYNC) td = (struct thread *)0; else td = curthread; /* XXX */ if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; if ((bp->b_flags & B_ASYNC) == 0 ) (void)smbfs_doio(ap->a_vp, bp, cr, td); return (0); } int smbfs_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int fflag; struct ucred *cred; struct thread *td; } */ *ap; { return ENOTTY; } static char smbfs_atl[] = "rhsvda"; static int smbfs_getextattr(struct vop_getextattr_args *ap) /* { IN struct vnode *a_vp; IN char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; struct uio *uio = ap->a_uio; const char *name = ap->a_name; struct smbnode *np = VTOSMB(vp); struct vattr vattr; char buf[10]; int i, attr, error; error = VOP_ACCESS(vp, VREAD, cred, td); if (error) return error; error = VOP_GETATTR(vp, &vattr, cred); if (error) return error; if (strcmp(name, "dosattr") == 0) { attr = np->n_dosattr; for (i = 0; i < 6; i++, attr >>= 1) buf[i] = (attr & 1) ? smbfs_atl[i] : '-'; buf[i] = 0; error = uiomove(buf, i, uio); } else error = EINVAL; return error; } /* * Since we expected to support F_GETLK (and SMB protocol has no such function), * it is necessary to use lf_advlock(). It would be nice if this function had * a callback mechanism because it will help to improve a level of consistency. */ int smbfs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct flock *fl = ap->a_fl; caddr_t id = (caddr_t)1 /* ap->a_id */; /* int flags = ap->a_flags;*/ struct thread *td = curthread; struct smb_cred *scred; u_quad_t size; off_t start, end, oadd; int error, lkop; if (vp->v_type == VDIR) { /* * SMB protocol have no support for directory locking. * Although locks can be processed on local machine, I don't * think that this is a good idea, because some programs * can work wrong assuming directory is locked. So, we just * return 'operation not supported */ return EOPNOTSUPP; } size = np->n_size; switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: start = fl->l_start; break; case SEEK_END: if (size > OFF_MAX || (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) return EOVERFLOW; start = size + fl->l_start; break; default: return EINVAL; } if (start < 0) return EINVAL; if (fl->l_len < 0) { if (start == 0) return EINVAL; end = start - 1; start += fl->l_len; if (start < 0) return EINVAL; } else if (fl->l_len == 0) end = -1; else { oadd = fl->l_len - 1; if (oadd > OFF_MAX - start) return EOVERFLOW; end = start + oadd; } scred = smbfs_malloc_scred(); smb_makescred(scred, td, td->td_ucred); switch (ap->a_op) { case F_SETLK: switch (fl->l_type) { case F_WRLCK: lkop = SMB_LOCK_EXCL; break; case F_RDLCK: lkop = SMB_LOCK_SHARED; break; case F_UNLCK: lkop = SMB_LOCK_RELEASE; break; default: smbfs_free_scred(scred); return EINVAL; } error = lf_advlock(ap, &vp->v_lockf, size); if (error) break; lkop = SMB_LOCK_EXCL; error = smbfs_smb_lock(np, lkop, id, start, end, scred); if (error) { int oldtype = fl->l_type; fl->l_type = F_UNLCK; ap->a_op = F_UNLCK; lf_advlock(ap, &vp->v_lockf, size); fl->l_type = oldtype; } break; case F_UNLCK: lf_advlock(ap, &vp->v_lockf, size); error = smbfs_smb_lock(np, SMB_LOCK_RELEASE, id, start, end, scred); break; case F_GETLK: error = lf_advlock(ap, &vp->v_lockf, size); break; default: smbfs_free_scred(scred); return EINVAL; } smbfs_free_scred(scred); return error; } static int smbfs_pathcheck(struct smbmount *smp, const char *name, int nmlen, int nameiop) { static const char *badchars = "*/:<>?"; static const char *badchars83 = " +|,[]=;"; const char *cp; int i, error; /* * Backslash characters, being a path delimiter, are prohibited * within a path component even for LOOKUP operations. */ if (strchr(name, '\\') != NULL) return ENOENT; if (nameiop == LOOKUP) return 0; error = ENOENT; if (SMB_DIALECT(SSTOVC(smp->sm_share)) < SMB_DIALECT_LANMAN2_0) { /* * Name should conform 8.3 format */ if (nmlen > 12) return ENAMETOOLONG; cp = strchr(name, '.'); if (cp == NULL) return error; if (cp == name || (cp - name) > 8) return error; cp = strchr(cp + 1, '.'); if (cp != NULL) return error; for (cp = name, i = 0; i < nmlen; i++, cp++) if (strchr(badchars83, *cp) != NULL) return error; } for (cp = name, i = 0; i < nmlen; i++, cp++) if (strchr(badchars, *cp) != NULL) return error; return 0; } /* * Things go even weird without fixed inode numbers... */ int smbfs_lookup(ap) struct vop_lookup_args /* { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct thread *td = curthread; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct vnode *vp; struct smbmount *smp; struct mount *mp = dvp->v_mount; struct smbnode *dnp; struct smbfattr fattr, *fap; struct smb_cred *scred; char *name = cnp->cn_nameptr; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; int nmlen = cnp->cn_namelen; int error, islastcn, isdot; int killit; SMBVDEBUG("\n"); if (dvp->v_type != VDIR) return ENOTDIR; if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) { SMBFSERR("invalid '..'\n"); return EIO; } islastcn = flags & ISLASTCN; if (islastcn && (mp->mnt_flag & MNT_RDONLY) && (nameiop != LOOKUP)) return EROFS; error = vn_dir_check_exec(dvp, cnp); if (error != 0) return error; smp = VFSTOSMBFS(mp); dnp = VTOSMB(dvp); isdot = (nmlen == 1 && name[0] == '.'); error = smbfs_pathcheck(smp, cnp->cn_nameptr, cnp->cn_namelen, nameiop); if (error) return ENOENT; error = cache_lookup(dvp, vpp, cnp, NULL, NULL); SMBVDEBUG("cache_lookup returned %d\n", error); if (error > 0) return error; if (error) { /* name was found */ struct vattr vattr; killit = 0; vp = *vpp; error = VOP_GETATTR(vp, &vattr, cnp->cn_cred); /* * If the file type on the server is inconsistent * with what it was when we created the vnode, * kill the bogus vnode now and fall through to * the code below to create a new one with the * right type. */ if (error == 0 && ((vp->v_type == VDIR && (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) == 0) || (vp->v_type == VREG && (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) != 0))) killit = 1; else if (error == 0 /* && vattr.va_ctime.tv_sec == VTOSMB(vp)->n_ctime*/) { - if (nameiop != LOOKUP && islastcn) - cnp->cn_flags |= SAVENAME; SMBVDEBUG("use cached vnode\n"); return (0); } cache_purge(vp); /* * XXX This is not quite right, if '.' is * inconsistent, we really need to start the lookup * all over again. Hopefully there is some other * guarantee that prevents this case from happening. */ if (killit && vp != dvp) vgone(vp); if (vp != dvp) vput(vp); else vrele(vp); *vpp = NULLVP; } /* * entry is not in the cache or has been expired */ error = 0; *vpp = NULLVP; scred = smbfs_malloc_scred(); smb_makescred(scred, td, cnp->cn_cred); fap = &fattr; if (flags & ISDOTDOT) { /* * In the DOTDOT case, don't go over-the-wire * in order to request attributes. We already * know it's a directory and subsequent call to * smbfs_getattr() will restore consistency. * */ SMBVDEBUG("smbfs_smb_lookup: dotdot\n"); } else if (isdot) { error = smbfs_smb_lookup(dnp, NULL, 0, fap, scred); SMBVDEBUG("result of smbfs_smb_lookup: %d\n", error); } else { error = smbfs_smb_lookup(dnp, name, nmlen, fap, scred); SMBVDEBUG("result of smbfs_smb_lookup: %d\n", error); } if (error && error != ENOENT) goto out; if (error) { /* entry not found */ /* * Handle RENAME or CREATE case... */ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) goto out; - cnp->cn_flags |= SAVENAME; error = EJUSTRETURN; goto out; } error = ENOENT; goto out; }/* else { SMBVDEBUG("Found entry %s with id=%d\n", fap->entryName, fap->dirEntNum); }*/ /* * handle DELETE case ... */ if (nameiop == DELETE && islastcn) { /* delete last component */ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) goto out; if (isdot) { VREF(dvp); *vpp = dvp; goto out; } error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp); if (error) goto out; *vpp = vp; - cnp->cn_flags |= SAVENAME; goto out; } if (nameiop == RENAME && islastcn) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) goto out; if (isdot) { error = EISDIR; goto out; } error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp); if (error) goto out; *vpp = vp; - cnp->cn_flags |= SAVENAME; goto out; } if (flags & ISDOTDOT) { mp = dvp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(dvp); error = vfs_busy(mp, 0); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vfs_rel(mp); if (error) { error = ENOENT; goto out; } if (VN_IS_DOOMED(dvp)) { vfs_unbusy(mp); error = ENOENT; goto out; } } VOP_UNLOCK(dvp); error = smbfs_nget(mp, dvp, name, nmlen, NULL, &vp); vfs_unbusy(mp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(dvp)) { if (error == 0) vput(vp); error = ENOENT; } if (error) goto out; *vpp = vp; } else if (isdot) { vref(dvp); *vpp = dvp; } else { error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp); if (error) goto out; *vpp = vp; SMBVDEBUG("lookup: getnewvp!\n"); } if ((cnp->cn_flags & MAKEENTRY)/* && !islastcn*/) { /* VTOSMB(*vpp)->n_ctime = VTOSMB(*vpp)->n_vattr.va_ctime.tv_sec;*/ cache_enter(dvp, *vpp, cnp); } out: smbfs_free_scred(scred); return (error); } diff --git a/sys/fs/tmpfs/tmpfs_subr.c b/sys/fs/tmpfs/tmpfs_subr.c index 7ec5f9bf3ffa..82036a35b4f0 100644 --- a/sys/fs/tmpfs/tmpfs_subr.c +++ b/sys/fs/tmpfs/tmpfs_subr.c @@ -1,2271 +1,2270 @@ /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Efficient memory file system supporting functions. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "tmpfs file system"); static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; MALLOC_DEFINE(M_TMPFSDIR, "tmpfs dir", "tmpfs dirent structure"); static uma_zone_t tmpfs_node_pool; VFS_SMR_DECLARE; int tmpfs_pager_type = -1; static vm_object_t tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; MPASS(handle == NULL); MPASS(offset == 0); object = vm_object_allocate_dyn(tmpfs_pager_type, size, OBJ_COLORED | OBJ_SWAP); if (!swap_pager_init_object(object, NULL, NULL, size, 0)) { vm_object_deallocate(object); object = NULL; } return (object); } /* * Make sure tmpfs vnodes with writable mappings can be found on the lazy list. * * This allows for periodic mtime updates while only scanning vnodes which are * plausibly dirty, see tmpfs_update_mtime_lazy. */ static void tmpfs_pager_writecount_recalc(vm_object_t object, vm_offset_t old, vm_offset_t new) { struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); vp = object->un_pager.swp.swp_tmpfs; /* * Forced unmount? */ if (vp == NULL) { KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, ("object %p with OBJ_TMPFS_VREF but without vnode", object)); VM_OBJECT_WUNLOCK(object); return; } if (old == 0) { VNASSERT((object->flags & OBJ_TMPFS_VREF) == 0, vp, ("object without writable mappings has a reference")); VNPASS(vp->v_usecount > 0, vp); } else { VNASSERT((object->flags & OBJ_TMPFS_VREF) != 0, vp, ("object with writable mappings does not have a reference")); } if (old == new) { VM_OBJECT_WUNLOCK(object); return; } if (new == 0) { vm_object_clear_flag(object, OBJ_TMPFS_VREF); VM_OBJECT_WUNLOCK(object); vrele(vp); } else { if ((object->flags & OBJ_TMPFS_VREF) == 0) { vref(vp); vlazy(vp); vm_object_set_flag(object, OBJ_TMPFS_VREF); } VM_OBJECT_WUNLOCK(object); } } static void tmpfs_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { vm_offset_t new, old; VM_OBJECT_WLOCK(object); KASSERT((object->flags & OBJ_ANON) == 0, ("%s: object %p with OBJ_ANON", __func__, object)); old = object->un_pager.swp.writemappings; object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; new = object->un_pager.swp.writemappings; tmpfs_pager_writecount_recalc(object, old, new); VM_OBJECT_ASSERT_UNLOCKED(object); } static void tmpfs_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { vm_offset_t new, old; VM_OBJECT_WLOCK(object); KASSERT((object->flags & OBJ_ANON) == 0, ("%s: object %p with OBJ_ANON", __func__, object)); old = object->un_pager.swp.writemappings; object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; new = object->un_pager.swp.writemappings; tmpfs_pager_writecount_recalc(object, old, new); VM_OBJECT_ASSERT_UNLOCKED(object); } static void tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) { struct vnode *vp; /* * Tmpfs VREG node, which was reclaimed, has tmpfs_pager_type * type, but not OBJ_TMPFS flag. In this case there is no * v_writecount to adjust. */ if (vp_heldp != NULL) VM_OBJECT_RLOCK(object); else VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; if (vp != NULL) { *vpp = vp; if (vp_heldp != NULL) { vhold(vp); *vp_heldp = true; } } } if (vp_heldp != NULL) VM_OBJECT_RUNLOCK(object); } struct pagerops tmpfs_pager_ops = { .pgo_kvme_type = KVME_TYPE_VNODE, .pgo_alloc = tmpfs_pager_alloc, .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, .pgo_update_writecount = tmpfs_pager_update_writecount, .pgo_release_writecount = tmpfs_pager_release_writecount, .pgo_mightbedirty = vm_object_mightbedirty_, .pgo_getvp = tmpfs_pager_getvp, }; static int tmpfs_node_ctor(void *mem, int size, void *arg, int flags) { struct tmpfs_node *node; node = mem; node->tn_gen++; node->tn_size = 0; node->tn_status = 0; node->tn_accessed = false; node->tn_flags = 0; node->tn_links = 0; node->tn_vnode = NULL; node->tn_vpstate = 0; return (0); } static void tmpfs_node_dtor(void *mem, int size, void *arg) { struct tmpfs_node *node; node = mem; node->tn_type = VNON; } static int tmpfs_node_init(void *mem, int size, int flags) { struct tmpfs_node *node; node = mem; node->tn_id = 0; mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF); node->tn_gen = arc4random(); return (0); } static void tmpfs_node_fini(void *mem, int size) { struct tmpfs_node *node; node = mem; mtx_destroy(&node->tn_interlock); } int tmpfs_subr_init(void) { tmpfs_pager_type = vm_pager_alloc_dyn_type(&tmpfs_pager_ops, OBJT_SWAP); if (tmpfs_pager_type == -1) return (EINVAL); tmpfs_node_pool = uma_zcreate("TMPFS node", sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); VFS_SMR_ZONE_SET(tmpfs_node_pool); return (0); } void tmpfs_subr_uninit(void) { if (tmpfs_pager_type != -1) vm_pager_free_dyn_type(tmpfs_pager_type); tmpfs_pager_type = -1; uma_zdestroy(tmpfs_node_pool); } static int sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) { int error; long pages, bytes; pages = *(long *)arg1; bytes = pages * PAGE_SIZE; error = sysctl_handle_long(oidp, &bytes, 0, req); if (error || !req->newptr) return (error); pages = bytes / PAGE_SIZE; if (pages < TMPFS_PAGES_MINRESERVED) return (EINVAL); *(long *)arg1 = pages; return (0); } SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &tmpfs_pages_reserved, 0, sysctl_mem_reserved, "L", "Amount of available memory and swap below which tmpfs growth stops"); static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b); RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); size_t tmpfs_mem_avail(void) { size_t avail; long reserved; avail = swap_pager_avail + vm_free_count(); reserved = atomic_load_long(&tmpfs_pages_reserved); if (__predict_false(avail < reserved)) return (0); return (avail - reserved); } size_t tmpfs_pages_used(struct tmpfs_mount *tmp) { const size_t node_size = sizeof(struct tmpfs_node) + sizeof(struct tmpfs_dirent); size_t meta_pages; meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size, PAGE_SIZE); return (meta_pages + tmp->tm_pages_used); } static size_t tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages) { if (tmpfs_mem_avail() < req_pages) return (0); if (tmp->tm_pages_max != ULONG_MAX && tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp)) return (0); return (1); } static int tmpfs_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, int end, boolean_t ignerr) { vm_page_t m; int rv, error; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(base >= 0, ("%s: base %d", __func__, base)); KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, end)); error = 0; retry: m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); if (m != NULL) { MPASS(vm_page_all_valid(m)); } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; vm_object_pip_add(object, 1); VM_OBJECT_WUNLOCK(object); rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); VM_OBJECT_WLOCK(object); vm_object_pip_wakeup(object); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, and therefore not * recently accessed, immediately enqueue it for * asynchronous laundering. The current operation is * not regarded as an access. */ vm_page_launder(m); } else { vm_page_free(m); m = NULL; if (!ignerr) error = EIO; } } if (m != NULL) { pmap_zero_page_area(m, base, end - base); vm_page_set_dirty(m); vm_page_xunbusy(m); } return (error); } void tmpfs_ref_node(struct tmpfs_node *node) { #ifdef INVARIANTS u_int old; old = #endif refcount_acquire(&node->tn_refcount); #ifdef INVARIANTS KASSERT(old > 0, ("node %p zero refcount", node)); #endif } /* * Allocates a new node of type 'type' inside the 'tmp' mount point, with * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', * using the credentials of the process 'p'. * * If the node type is set to 'VDIR', then the parent parameter must point * to the parent directory of the node being created. It may only be NULL * while allocating the root node. * * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter * specifies the device the node represents. * * If the node type is set to 'VLNK', then the parameter target specifies * the file name of the target file for the symbolic link that is being * created. * * Note that new nodes are retrieved from the available list if it has * items or, if it is empty, from the node pool as long as there is enough * space to create them. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, enum vtype type, uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, const char *target, dev_t rdev, struct tmpfs_node **node) { struct tmpfs_node *nnode; char *symlink; char symlink_smr; /* If the root directory of the 'tmp' file system is not yet * allocated, this must be the request to do it. */ MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); MPASS(IFF(type == VLNK, target != NULL)); MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL)); if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max) return (ENOSPC); if (tmpfs_pages_check_avail(tmp, 1) == 0) return (ENOSPC); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { /* * When a new tmpfs node is created for fully * constructed mount point, there must be a parent * node, which vnode is locked exclusively. As * consequence, if the unmount is executing in * parallel, vflush() cannot reclaim the parent vnode. * Due to this, the check for MNTK_UNMOUNT flag is not * racy: if we did not see MNTK_UNMOUNT flag, then tmp * cannot be destroyed until node construction is * finished and the parent vnode unlocked. * * Tmpfs does not need to instantiate new nodes during * unmount. */ return (EBUSY); } if ((mp->mnt_kern_flag & MNT_RDONLY) != 0) return (EROFS); nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK); /* Generic initialization. */ nnode->tn_type = type; vfs_timestamp(&nnode->tn_atime); nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = nnode->tn_atime; nnode->tn_uid = uid; nnode->tn_gid = gid; nnode->tn_mode = mode; nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr); nnode->tn_refcount = 1; /* Type-specific initialization. */ switch (nnode->tn_type) { case VBLK: case VCHR: nnode->tn_rdev = rdev; break; case VDIR: RB_INIT(&nnode->tn_dir.tn_dirhead); LIST_INIT(&nnode->tn_dir.tn_dupindex); MPASS(parent != nnode); MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; nnode->tn_dir.tn_readdir_lastn = 0; nnode->tn_dir.tn_readdir_lastp = NULL; nnode->tn_links++; TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); nnode->tn_dir.tn_parent->tn_links++; TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); break; case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: MPASS(strlen(target) < MAXPATHLEN); nnode->tn_size = strlen(target); symlink = NULL; if (!tmp->tm_nonc) { symlink = cache_symlink_alloc(nnode->tn_size + 1, M_WAITOK); symlink_smr = true; } if (symlink == NULL) { symlink = malloc(nnode->tn_size + 1, M_TMPFSNAME, M_WAITOK); symlink_smr = false; } memcpy(symlink, target, nnode->tn_size + 1); /* * Allow safe symlink resolving for lockless lookup. * tmpfs_fplookup_symlink references this comment. * * 1. nnode is not yet visible to the world * 2. both tn_link_target and tn_link_smr get populated * 3. release fence publishes their content * 4. tn_link_target content is immutable until node destruction, * where the pointer gets set to NULL * 5. tn_link_smr is never changed once set * * As a result it is sufficient to issue load consume on the node * pointer to also get the above content in a stable manner. * Worst case tn_link_smr flag may be set to true despite being stale, * while the target buffer is already cleared out. */ atomic_store_ptr(&nnode->tn_link_target, symlink); atomic_store_char((char *)&nnode->tn_link_smr, symlink_smr); atomic_thread_fence_rel(); break; case VREG: nnode->tn_reg.tn_aobj = vm_pager_allocate(tmpfs_pager_type, NULL, 0, VM_PROT_DEFAULT, 0, NULL /* XXXKIB - tmpfs needs swap reservation */); /* OBJ_TMPFS is set together with the setting of vp->v_object */ nnode->tn_reg.tn_tmp = tmp; break; default: panic("tmpfs_alloc_node: type %p %d", nnode, (int)nnode->tn_type); } TMPFS_LOCK(tmp); LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); nnode->tn_attached = true; tmp->tm_nodes_inuse++; tmp->tm_refcount++; TMPFS_UNLOCK(tmp); *node = nnode; return (0); } /* * Destroys the node pointed to by node from the file system 'tmp'. * If the node references a directory, no entries are allowed. */ void tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) { if (refcount_release_if_not_last(&node->tn_refcount)) return; TMPFS_LOCK(tmp); TMPFS_NODE_LOCK(node); if (!tmpfs_free_node_locked(tmp, node, false)) { TMPFS_NODE_UNLOCK(node); TMPFS_UNLOCK(tmp); } } bool tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, bool detach) { vm_object_t uobj; char *symlink; bool last; TMPFS_MP_ASSERT_LOCKED(tmp); TMPFS_NODE_ASSERT_LOCKED(node); last = refcount_release(&node->tn_refcount); if (node->tn_attached && (detach || last)) { MPASS(tmp->tm_nodes_inuse > 0); tmp->tm_nodes_inuse--; LIST_REMOVE(node, tn_entries); node->tn_attached = false; } if (!last) return (false); TMPFS_NODE_UNLOCK(node); #ifdef INVARIANTS MPASS(node->tn_vnode == NULL); MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); /* * Make sure this is a node type we can deal with. Everything is explicitly * enumerated without the 'default' clause so the compiler can throw an * error in case a new type is added. */ switch (node->tn_type) { case VBLK: case VCHR: case VDIR: case VFIFO: case VSOCK: case VLNK: case VREG: break; case VNON: case VBAD: case VMARKER: panic("%s: bad type %d for node %p", __func__, (int)node->tn_type, node); } #endif switch (node->tn_type) { case VREG: uobj = node->tn_reg.tn_aobj; if (uobj != NULL) { if (uobj->size != 0) atomic_subtract_long(&tmp->tm_pages_used, uobj->size); } tmpfs_free_tmp(tmp); if (uobj != NULL) { KASSERT((uobj->flags & OBJ_TMPFS) == 0, ("leaked OBJ_TMPFS node %p vm_obj %p", node, uobj)); vm_object_deallocate(uobj); } break; case VLNK: tmpfs_free_tmp(tmp); symlink = node->tn_link_target; atomic_store_ptr(&node->tn_link_target, NULL); if (atomic_load_char(&node->tn_link_smr)) { cache_symlink_free(symlink, node->tn_size + 1); } else { free(symlink, M_TMPFSNAME); } break; default: tmpfs_free_tmp(tmp); break; } uma_zfree_smr(tmpfs_node_pool, node); return (true); } static __inline uint32_t tmpfs_dirent_hash(const char *name, u_int len) { uint32_t hash; hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP hash &= 0xf; #endif if (hash < TMPFS_DIRCOOKIE_MIN) hash += TMPFS_DIRCOOKIE_MIN; return (hash); } static __inline off_t tmpfs_dirent_cookie(struct tmpfs_dirent *de) { if (de == NULL) return (TMPFS_DIRCOOKIE_EOF); MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); return (de->td_cookie); } static __inline boolean_t tmpfs_dirent_dup(struct tmpfs_dirent *de) { return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); } static __inline boolean_t tmpfs_dirent_duphead(struct tmpfs_dirent *de) { return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); } void tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) { de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); memcpy(de->ud.td_name, name, namelen); de->td_namelen = namelen; } /* * Allocates a new directory entry for the node node with a name of name. * The new directory entry is returned in *de. * * The link count of node is increased by one to reflect the new object * referencing it. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, const char *name, u_int len, struct tmpfs_dirent **de) { struct tmpfs_dirent *nde; nde = malloc(sizeof(*nde), M_TMPFSDIR, M_WAITOK); nde->td_node = node; if (name != NULL) { nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); tmpfs_dirent_init(nde, name, len); } else nde->td_namelen = 0; if (node != NULL) node->tn_links++; *de = nde; return (0); } /* * Frees a directory entry. It is the caller's responsibility to destroy * the node referenced by it if needed. * * The link count of node is decreased by one to reflect the removal of an * object that referenced it. This only happens if 'node_exists' is true; * otherwise the function will not access the node referred to by the * directory entry, as it may already have been released from the outside. */ void tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) { struct tmpfs_node *node; node = de->td_node; if (node != NULL) { MPASS(node->tn_links > 0); node->tn_links--; } if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) free(de->ud.td_name, M_TMPFSNAME); free(de, M_TMPFSDIR); } void tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) { bool want_vrele; ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject"); if (vp->v_type != VREG || obj == NULL) return; VM_OBJECT_WLOCK(obj); VI_LOCK(vp); /* * May be going through forced unmount. */ want_vrele = false; if ((obj->flags & OBJ_TMPFS_VREF) != 0) { vm_object_clear_flag(obj, OBJ_TMPFS_VREF); want_vrele = true; } vm_object_clear_flag(obj, OBJ_TMPFS); obj->un_pager.swp.swp_tmpfs = NULL; if (vp->v_writecount < 0) vp->v_writecount = 0; VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(obj); if (want_vrele) { vrele(vp); } } /* * Allocates a new vnode for the node node or returns a new reference to * an existing one if the node had already a vnode referencing it. The * resulting locked vnode is returned in *vpp. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, struct vnode **vpp) { struct vnode *vp; enum vgetstate vs; struct tmpfs_mount *tm; vm_object_t object; int error; error = 0; tm = VFS_TO_TMPFS(mp); TMPFS_NODE_LOCK(node); tmpfs_ref_node(node); loop: TMPFS_NODE_ASSERT_LOCKED(node); if ((vp = node->tn_vnode) != NULL) { MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) || (VN_IS_DOOMED(vp) && (lkflag & LK_NOWAIT) != 0)) { TMPFS_NODE_UNLOCK(node); error = ENOENT; vp = NULL; goto out; } if (VN_IS_DOOMED(vp)) { node->tn_vpstate |= TMPFS_VNODE_WRECLAIM; while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) { msleep(&node->tn_vnode, TMPFS_NODE_MTX(node), 0, "tmpfsE", 0); } goto loop; } vs = vget_prep(vp); TMPFS_NODE_UNLOCK(node); error = vget_finish(vp, lkflag, vs); if (error == ENOENT) { TMPFS_NODE_LOCK(node); goto loop; } if (error != 0) { vp = NULL; goto out; } /* * Make sure the vnode is still there after * getting the interlock to avoid racing a free. */ if (node->tn_vnode != vp) { vput(vp); TMPFS_NODE_LOCK(node); goto loop; } goto out; } if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) || (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) { TMPFS_NODE_UNLOCK(node); error = ENOENT; vp = NULL; goto out; } /* * otherwise lock the vp list while we call getnewvnode * since that can block. */ if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { node->tn_vpstate |= TMPFS_VNODE_WANT; error = msleep((caddr_t) &node->tn_vpstate, TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0); if (error != 0) goto out; goto loop; } else node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; TMPFS_NODE_UNLOCK(node); /* Get a new vnode and associate it with our node. */ error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ? &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp); if (error != 0) goto unlock; MPASS(vp != NULL); /* lkflag is ignored, the lock is exclusive */ (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_data = node; vp->v_type = node->tn_type; /* Type-specific initialization. */ switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VSOCK: break; case VFIFO: vp->v_op = &tmpfs_fifoop_entries; break; case VREG: object = node->tn_reg.tn_aobj; VM_OBJECT_WLOCK(object); KASSERT((object->flags & OBJ_TMPFS_VREF) == 0, ("%s: object %p with OBJ_TMPFS_VREF but without vnode", __func__, object)); KASSERT(object->un_pager.swp.writemappings == 0, ("%s: object %p has writemappings", __func__, object)); VI_LOCK(vp); KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); vp->v_object = object; object->un_pager.swp.swp_tmpfs = vp; vm_object_set_flag(object, OBJ_TMPFS); vn_irflag_set_locked(vp, VIRF_PGREAD | VIRF_TEXT_REF); VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(object); break; case VDIR: MPASS(node->tn_dir.tn_parent != NULL); if (node->tn_dir.tn_parent == node) vp->v_vflag |= VV_ROOT; break; default: panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); } if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); error = insmntque1(vp, mp); if (error != 0) { /* Need to clear v_object for insmntque failure. */ tmpfs_destroy_vobject(vp, vp->v_object); vp->v_object = NULL; vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); vp = NULL; } unlock: TMPFS_NODE_LOCK(node); MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; node->tn_vnode = vp; if (node->tn_vpstate & TMPFS_VNODE_WANT) { node->tn_vpstate &= ~TMPFS_VNODE_WANT; TMPFS_NODE_UNLOCK(node); wakeup((caddr_t) &node->tn_vpstate); } else TMPFS_NODE_UNLOCK(node); out: if (error == 0) { *vpp = vp; #ifdef INVARIANTS MPASS(*vpp != NULL && VOP_ISLOCKED(*vpp)); TMPFS_NODE_LOCK(node); MPASS(*vpp == node->tn_vnode); TMPFS_NODE_UNLOCK(node); #endif } tmpfs_free_node(tm, node); return (error); } /* * Destroys the association between the vnode vp and the node it * references. */ void tmpfs_free_vp(struct vnode *vp) { struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); TMPFS_NODE_ASSERT_LOCKED(node); node->tn_vnode = NULL; if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) wakeup(&node->tn_vnode); node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; vp->v_data = NULL; } /* * Allocates a new file of type 'type' and adds it to the parent directory * 'dvp'; this addition is done using the component name given in 'cnp'. * The ownership of the new file is automatically assigned based on the * credentials of the caller (through 'cnp'), the group is set based on * the parent directory and the mode is determined from the 'vap' argument. * If successful, *vpp holds a vnode to the newly created file and zero * is returned. Otherwise *vpp is NULL and the function returns an * appropriate error code. */ int tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, struct componentname *cnp, const char *target) { int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; struct tmpfs_node *parent; ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); - MPASS(cnp->cn_flags & HASBUF); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULL; /* If the entry we are creating is a directory, we cannot overflow * the number of links of its parent, because it will get a new * link. */ if (vap->va_type == VDIR) { /* Ensure that we do not overflow the maximum number of links * imposed by the system. */ MPASS(dnode->tn_links <= TMPFS_LINK_MAX); if (dnode->tn_links == TMPFS_LINK_MAX) { return (EMLINK); } parent = dnode; MPASS(parent != NULL); } else parent = NULL; /* Allocate a node that represents the new file. */ error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, target, vap->va_rdev, &node); if (error != 0) return (error); /* Allocate a directory entry that points to the new file. */ error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) { tmpfs_free_node(tmp, node); return (error); } /* Allocate a vnode for the new file. */ error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); if (error != 0) { tmpfs_free_dirent(tmp, de); tmpfs_free_node(tmp, node); return (error); } /* Now that all required items are allocated, we can proceed to * insert the new node into the directory, an operation that * cannot fail. */ if (cnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(dvp, cnp); tmpfs_dir_attach(dvp, de); return (0); } struct tmpfs_dirent * tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) { struct tmpfs_dirent *de; de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); dc->tdc_tree = de; if (de != NULL && tmpfs_dirent_duphead(de)) de = LIST_FIRST(&de->ud.td_duphead); dc->tdc_current = de; return (dc->tdc_current); } struct tmpfs_dirent * tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) { struct tmpfs_dirent *de; MPASS(dc->tdc_tree != NULL); if (tmpfs_dirent_dup(dc->tdc_current)) { dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); if (dc->tdc_current != NULL) return (dc->tdc_current); } dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, dc->tdc_tree); if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); MPASS(dc->tdc_current != NULL); } return (dc->tdc_current); } /* Lookup directory entry in RB-Tree. Function may return duphead entry. */ static struct tmpfs_dirent * tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) { struct tmpfs_dirent *de, dekey; dekey.td_hash = hash; de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); return (de); } /* Lookup directory entry by cookie, initialize directory cursor accordingly. */ static struct tmpfs_dirent * tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie, struct tmpfs_dir_cursor *dc) { struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead; struct tmpfs_dirent *de, dekey; MPASS(cookie >= TMPFS_DIRCOOKIE_MIN); if (cookie == node->tn_dir.tn_readdir_lastn && (de = node->tn_dir.tn_readdir_lastp) != NULL) { /* Protect against possible race, tn_readdir_last[pn] * may be updated with only shared vnode lock held. */ if (cookie == tmpfs_dirent_cookie(de)) goto out; } if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) { LIST_FOREACH(de, &node->tn_dir.tn_dupindex, uh.td_dup.index_entries) { MPASS(tmpfs_dirent_dup(de)); if (de->td_cookie == cookie) goto out; /* dupindex list is sorted. */ if (de->td_cookie < cookie) { de = NULL; goto out; } } MPASS(de == NULL); goto out; } if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) { de = NULL; } else { dekey.td_hash = cookie; /* Recover if direntry for cookie was removed */ de = RB_NFIND(tmpfs_dir, dirhead, &dekey); } dc->tdc_tree = de; dc->tdc_current = de; if (de != NULL && tmpfs_dirent_duphead(de)) { dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); MPASS(dc->tdc_current != NULL); } return (dc->tdc_current); out: dc->tdc_tree = de; dc->tdc_current = de; if (de != NULL && tmpfs_dirent_dup(de)) dc->tdc_tree = tmpfs_dir_xlookup_hash(node, de->td_hash); return (dc->tdc_current); } /* * Looks for a directory entry in the directory represented by node. * 'cnp' describes the name of the entry to look for. Note that the . * and .. components are not allowed as they do not physically exist * within directories. * * Returns a pointer to the entry when found, otherwise NULL. */ struct tmpfs_dirent * tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, struct componentname *cnp) { struct tmpfs_dir_duphead *duphead; struct tmpfs_dirent *de; uint32_t hash; MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.'))); TMPFS_VALIDATE_DIR(node); hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); de = tmpfs_dir_xlookup_hash(node, hash); if (de != NULL && tmpfs_dirent_duphead(de)) { duphead = &de->ud.td_duphead; LIST_FOREACH(de, duphead, uh.td_dup.entries) { if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, cnp->cn_namelen)) break; } } else if (de != NULL) { if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, cnp->cn_namelen)) de = NULL; } if (de != NULL && f != NULL && de->td_node != f) de = NULL; return (de); } /* * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex * list, allocate new cookie value. */ static void tmpfs_dir_attach_dup(struct tmpfs_node *dnode, struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) { struct tmpfs_dir_duphead *dupindex; struct tmpfs_dirent *de, *pde; dupindex = &dnode->tn_dir.tn_dupindex; de = LIST_FIRST(dupindex); if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { if (de == NULL) nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; else nde->td_cookie = de->td_cookie + 1; MPASS(tmpfs_dirent_dup(nde)); LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } /* * Cookie numbers are near exhaustion. Scan dupindex list for unused * numbers. dupindex list is sorted in descending order. Keep it so * after inserting nde. */ while (1) { pde = de; de = LIST_NEXT(de, uh.td_dup.index_entries); if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { /* * Last element of the index doesn't have minimal cookie * value, use it. */ nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } else if (de == NULL) { /* * We are so lucky have 2^30 hash duplicates in single * directory :) Return largest possible cookie value. * It should be fine except possible issues with * VOP_READDIR restart. */ nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } if (de->td_cookie + 1 == pde->td_cookie || de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) continue; /* No hole or invalid cookie. */ nde->td_cookie = de->td_cookie + 1; MPASS(tmpfs_dirent_dup(nde)); MPASS(pde->td_cookie > nde->td_cookie); MPASS(nde->td_cookie > de->td_cookie); LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } } /* * Attaches the directory entry de to the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_alloc_dirent. */ void tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_node *dnode; struct tmpfs_dirent *xde, *nde; ASSERT_VOP_ELOCKED(vp, __func__); MPASS(de->td_namelen > 0); MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); MPASS(de->td_cookie == de->td_hash); dnode = VP_TO_TMPFS_DIR(vp); dnode->tn_dir.tn_readdir_lastn = 0; dnode->tn_dir.tn_readdir_lastp = NULL; MPASS(!tmpfs_dirent_dup(de)); xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); if (xde != NULL && tmpfs_dirent_duphead(xde)) tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); else if (xde != NULL) { /* * Allocate new duphead. Swap xde with duphead to avoid * adding/removing elements with the same hash. */ MPASS(!tmpfs_dirent_dup(xde)); tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, &nde); /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */ memcpy(nde, xde, sizeof(*xde)); xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; LIST_INIT(&xde->ud.td_duphead); xde->td_namelen = 0; xde->td_node = NULL; tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); } dnode->tn_size += sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; dnode->tn_accessed = true; tmpfs_update(vp); } /* * Detaches the directory entry de from the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_free_dirent. */ void tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_mount *tmp; struct tmpfs_dir *head; struct tmpfs_node *dnode; struct tmpfs_dirent *xde; ASSERT_VOP_ELOCKED(vp, __func__); dnode = VP_TO_TMPFS_DIR(vp); head = &dnode->tn_dir.tn_dirhead; dnode->tn_dir.tn_readdir_lastn = 0; dnode->tn_dir.tn_readdir_lastp = NULL; if (tmpfs_dirent_dup(de)) { /* Remove duphead if de was last entry. */ if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); MPASS(tmpfs_dirent_duphead(xde)); } else xde = NULL; LIST_REMOVE(de, uh.td_dup.entries); LIST_REMOVE(de, uh.td_dup.index_entries); if (xde != NULL) { if (LIST_EMPTY(&xde->ud.td_duphead)) { RB_REMOVE(tmpfs_dir, head, xde); tmp = VFS_TO_TMPFS(vp->v_mount); MPASS(xde->td_node == NULL); tmpfs_free_dirent(tmp, xde); } } de->td_cookie = de->td_hash; } else RB_REMOVE(tmpfs_dir, head, de); dnode->tn_size -= sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; dnode->tn_accessed = true; tmpfs_update(vp); } void tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) { struct tmpfs_dirent *de, *dde, *nde; RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); /* Node may already be destroyed. */ de->td_node = NULL; if (tmpfs_dirent_duphead(de)) { while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { LIST_REMOVE(dde, uh.td_dup.entries); dde->td_node = NULL; tmpfs_free_dirent(tmp, dde); } } tmpfs_free_dirent(tmp, de); } } /* * Helper function for tmpfs_readdir. Creates a '.' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ static int tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, struct uio *uio) { int error; struct dirent dent; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); dent.d_fileno = node->tn_id; dent.d_off = TMPFS_DIRCOOKIE_DOTDOT; dent.d_type = DT_DIR; dent.d_namlen = 1; dent.d_name[0] = '.'; dent.d_reclen = GENERIC_DIRSIZ(&dent); dirent_terminate(&dent); if (dent.d_reclen > uio->uio_resid) error = EJUSTRETURN; else error = uiomove(&dent, dent.d_reclen, uio); tmpfs_set_accessed(tm, node); return (error); } /* * Helper function for tmpfs_readdir. Creates a '..' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ static int tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, struct uio *uio, off_t next) { struct tmpfs_node *parent; struct dirent dent; int error; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); /* * Return ENOENT if the current node is already removed. */ TMPFS_ASSERT_LOCKED(node); parent = node->tn_dir.tn_parent; if (parent == NULL) return (ENOENT); dent.d_fileno = parent->tn_id; dent.d_off = next; dent.d_type = DT_DIR; dent.d_namlen = 2; dent.d_name[0] = '.'; dent.d_name[1] = '.'; dent.d_reclen = GENERIC_DIRSIZ(&dent); dirent_terminate(&dent); if (dent.d_reclen > uio->uio_resid) error = EJUSTRETURN; else error = uiomove(&dent, dent.d_reclen, uio); tmpfs_set_accessed(tm, node); return (error); } /* * Helper function for tmpfs_readdir. Returns as much directory entries * as can fit in the uio space. The read starts at uio->uio_offset. * The function returns 0 on success, -1 if there was not enough space * in the uio structure to hold the directory entry or an appropriate * error code if another error happens. */ int tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node, struct uio *uio, int maxcookies, uint64_t *cookies, int *ncookies) { struct tmpfs_dir_cursor dc; struct tmpfs_dirent *de, *nde; off_t off; int error; TMPFS_VALIDATE_DIR(node); off = 0; /* * Lookup the node from the current offset. The starting offset of * 0 will lookup both '.' and '..', and then the first real entry, * or EOF if there are none. Then find all entries for the dir that * fit into the buffer. Once no more entries are found (de == NULL), * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next * call to return 0. */ switch (uio->uio_offset) { case TMPFS_DIRCOOKIE_DOT: error = tmpfs_dir_getdotdent(tm, node, uio); if (error != 0) return (error); uio->uio_offset = off = TMPFS_DIRCOOKIE_DOTDOT; if (cookies != NULL) cookies[(*ncookies)++] = off; /* FALLTHROUGH */ case TMPFS_DIRCOOKIE_DOTDOT: de = tmpfs_dir_first(node, &dc); off = tmpfs_dirent_cookie(de); error = tmpfs_dir_getdotdotdent(tm, node, uio, off); if (error != 0) return (error); uio->uio_offset = off; if (cookies != NULL) cookies[(*ncookies)++] = off; /* EOF. */ if (de == NULL) return (0); break; case TMPFS_DIRCOOKIE_EOF: return (0); default: de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); if (de == NULL) return (EINVAL); if (cookies != NULL) off = tmpfs_dirent_cookie(de); } /* * Read as much entries as possible; i.e., until we reach the end of the * directory or we exhaust uio space. */ do { struct dirent d; /* * Create a dirent structure representing the current tmpfs_node * and fill it. */ if (de->td_node == NULL) { d.d_fileno = 1; d.d_type = DT_WHT; } else { d.d_fileno = de->td_node->tn_id; switch (de->td_node->tn_type) { case VBLK: d.d_type = DT_BLK; break; case VCHR: d.d_type = DT_CHR; break; case VDIR: d.d_type = DT_DIR; break; case VFIFO: d.d_type = DT_FIFO; break; case VLNK: d.d_type = DT_LNK; break; case VREG: d.d_type = DT_REG; break; case VSOCK: d.d_type = DT_SOCK; break; default: panic("tmpfs_dir_getdents: type %p %d", de->td_node, (int)de->td_node->tn_type); } } d.d_namlen = de->td_namelen; MPASS(de->td_namelen < sizeof(d.d_name)); (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); d.d_reclen = GENERIC_DIRSIZ(&d); /* * Stop reading if the directory entry we are treating is bigger * than the amount of data that can be returned. */ if (d.d_reclen > uio->uio_resid) { error = EJUSTRETURN; break; } nde = tmpfs_dir_next(node, &dc); d.d_off = tmpfs_dirent_cookie(nde); dirent_terminate(&d); /* * Copy the new dirent structure into the output buffer and * advance pointers. */ error = uiomove(&d, d.d_reclen, uio); if (error == 0) { de = nde; if (cookies != NULL) { off = tmpfs_dirent_cookie(de); MPASS(*ncookies < maxcookies); cookies[(*ncookies)++] = off; } } } while (error == 0 && uio->uio_resid > 0 && de != NULL); /* Skip setting off when using cookies as it is already done above. */ if (cookies == NULL) off = tmpfs_dirent_cookie(de); /* Update the offset and cache. */ uio->uio_offset = off; node->tn_dir.tn_readdir_lastn = off; node->tn_dir.tn_readdir_lastp = de; tmpfs_set_accessed(tm, node); return (error); } int tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; int error; error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) return (error); tmpfs_dir_attach(dvp, de); return (0); } void tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); MPASS(de != NULL && de->td_node == NULL); tmpfs_dir_detach(dvp, de); tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); } /* * Resizes the aobj associated with the regular file pointed to by 'vp' to the * size 'newsize'. 'vp' must point to a vnode that represents a regular file. * 'newsize' must be positive. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) { struct tmpfs_mount *tmp; struct tmpfs_node *node; vm_object_t uobj; vm_pindex_t idx, newpages, oldpages; off_t oldsize; int base, error; MPASS(vp->v_type == VREG); MPASS(newsize >= 0); node = VP_TO_TMPFS_NODE(vp); uobj = node->tn_reg.tn_aobj; tmp = VFS_TO_TMPFS(vp->v_mount); /* * Convert the old and new sizes to the number of pages needed to * store them. It may happen that we do not need to do anything * because the last allocated page can accommodate the change on * its own. */ oldsize = node->tn_size; oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); MPASS(oldpages == uobj->size); newpages = OFF_TO_IDX(newsize + PAGE_MASK); if (__predict_true(newpages == oldpages && newsize >= oldsize)) { node->tn_size = newsize; return (0); } if (newpages > oldpages && tmpfs_pages_check_avail(tmp, newpages - oldpages) == 0) return (ENOSPC); VM_OBJECT_WLOCK(uobj); if (newsize < oldsize) { /* * Zero the truncated part of the last page. */ base = newsize & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(newsize); error = tmpfs_partial_page_invalidate(uobj, idx, base, PAGE_SIZE, ignerr); if (error != 0) { VM_OBJECT_WUNLOCK(uobj); return (error); } } /* * Release any swap space and free any whole pages. */ if (newpages < oldpages) vm_object_page_remove(uobj, newpages, 0, 0); } uobj->size = newpages; VM_OBJECT_WUNLOCK(uobj); atomic_add_long(&tmp->tm_pages_used, newpages - oldpages); node->tn_size = newsize; return (0); } /* * Punch hole in the aobj associated with the regular file pointed to by 'vp'. * Requests completely beyond the end-of-file are converted to no-op. * * Returns 0 on success or error code from tmpfs_partial_page_invalidate() on * failure. */ int tmpfs_reg_punch_hole(struct vnode *vp, off_t *offset, off_t *length) { struct tmpfs_node *node; vm_object_t object; vm_pindex_t pistart, pi, piend; int startofs, endofs, end; off_t off, len; int error; KASSERT(*length <= OFF_MAX - *offset, ("%s: offset + length overflows", __func__)); node = VP_TO_TMPFS_NODE(vp); KASSERT(node->tn_type == VREG, ("%s: node is not regular file", __func__)); object = node->tn_reg.tn_aobj; off = *offset; len = omin(node->tn_size - off, *length); startofs = off & PAGE_MASK; endofs = (off + len) & PAGE_MASK; pistart = OFF_TO_IDX(off); piend = OFF_TO_IDX(off + len); pi = OFF_TO_IDX((vm_ooffset_t)off + PAGE_MASK); error = 0; /* Handle the case when offset is on or beyond file size. */ if (len <= 0) { *length = 0; return (0); } VM_OBJECT_WLOCK(object); /* * If there is a partial page at the beginning of the hole-punching * request, fill the partial page with zeroes. */ if (startofs != 0) { end = pistart != piend ? PAGE_SIZE : endofs; error = tmpfs_partial_page_invalidate(object, pistart, startofs, end, FALSE); if (error != 0) goto out; off += end - startofs; len -= end - startofs; } /* * Toss away the full pages in the affected area. */ if (pi < piend) { vm_object_page_remove(object, pi, piend, 0); off += IDX_TO_OFF(piend - pi); len -= IDX_TO_OFF(piend - pi); } /* * If there is a partial page at the end of the hole-punching request, * fill the partial page with zeroes. */ if (endofs != 0 && pistart != piend) { error = tmpfs_partial_page_invalidate(object, piend, 0, endofs, FALSE); if (error != 0) goto out; off += endofs; len -= endofs; } out: VM_OBJECT_WUNLOCK(object); *offset = off; *length = len; return (error); } void tmpfs_check_mtime(struct vnode *vp) { struct tmpfs_node *node; struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "check_mtime"); if (vp->v_type != VREG) return; obj = vp->v_object; KASSERT(obj->type == tmpfs_pager_type && (obj->flags & (OBJ_SWAP | OBJ_TMPFS)) == (OBJ_SWAP | OBJ_TMPFS), ("non-tmpfs obj")); /* unlocked read */ if (obj->generation != obj->cleangeneration) { VM_OBJECT_WLOCK(obj); if (obj->generation != obj->cleangeneration) { obj->cleangeneration = obj->generation; node = VP_TO_TMPFS_NODE(vp); node->tn_status |= TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED; } VM_OBJECT_WUNLOCK(obj); } } /* * Change flags of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chflags"); node = VP_TO_TMPFS_NODE(vp); if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | UF_SPARSE | UF_SYSTEM)) != 0) return (EOPNOTSUPP); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } } else { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || ((flags ^ node->tn_flags) & SF_SETTABLE)) return (EPERM); } node->tn_flags = flags; node->tn_status |= TMPFS_NODE_CHANGED; ASSERT_VOP_ELOCKED(vp, "chflags2"); return (0); } /* * Change access mode on the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; mode_t newmode; ASSERT_VOP_ELOCKED(vp, "chmod"); ASSERT_VOP_IN_SEQC(vp); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return (EFTYPE); } if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } newmode = node->tn_mode & ~ALLPERMS; newmode |= mode & ALLPERMS; atomic_store_short(&node->tn_mode, newmode); node->tn_status |= TMPFS_NODE_CHANGED; ASSERT_VOP_ELOCKED(vp, "chmod2"); return (0); } /* * Change ownership of the given vnode. At least one of uid or gid must * be different than VNOVAL. If one is set to that value, the attribute * is unchanged. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; uid_t ouid; gid_t ogid; mode_t newmode; ASSERT_VOP_ELOCKED(vp, "chown"); ASSERT_VOP_IN_SEQC(vp); node = VP_TO_TMPFS_NODE(vp); /* Assign default values if they are unknown. */ MPASS(uid != VNOVAL || gid != VNOVAL); if (uid == VNOVAL) uid = node->tn_uid; if (gid == VNOVAL) gid = node->tn_gid; MPASS(uid != VNOVAL && gid != VNOVAL); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if ((uid != node->tn_uid || (gid != node->tn_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) return (error); ogid = node->tn_gid; ouid = node->tn_uid; node->tn_uid = uid; node->tn_gid = gid; node->tn_status |= TMPFS_NODE_CHANGED; if ((node->tn_mode & (S_ISUID | S_ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { newmode = node->tn_mode & ~(S_ISUID | S_ISGID); atomic_store_short(&node->tn_mode, newmode); } } ASSERT_VOP_ELOCKED(vp, "chown2"); return (0); } /* * Change size of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chsize"); node = VP_TO_TMPFS_NODE(vp); /* Decide whether this is a valid operation based on the file type. */ error = 0; switch (vp->v_type) { case VDIR: return (EISDIR); case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VFIFO: /* * Allow modifications of special files even if in the file * system is mounted read-only (we are not modifying the * files themselves, but the objects they represent). */ return (0); default: /* Anything else is unsupported. */ return (EOPNOTSUPP); } /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return (EPERM); error = tmpfs_truncate(vp, size); /* * tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents * for us, as will update tn_status; no need to do that here. */ ASSERT_VOP_ELOCKED(vp, "chsize2"); return (error); } /* * Change access and modification times of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chtimes(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *l) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chtimes"); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return (EPERM); error = vn_utimes_perm(vp, vap, cred, l); if (error != 0) return (error); if (vap->va_atime.tv_sec != VNOVAL) node->tn_accessed = true; if (vap->va_mtime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; if (vap->va_birthtime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); if (vap->va_birthtime.tv_sec != VNOVAL) node->tn_birthtime = vap->va_birthtime; ASSERT_VOP_ELOCKED(vp, "chtimes2"); return (0); } void tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status) { if ((node->tn_status & status) == status || tm->tm_ronly) return; TMPFS_NODE_LOCK(node); node->tn_status |= status; TMPFS_NODE_UNLOCK(node); } void tmpfs_set_accessed(struct tmpfs_mount *tm, struct tmpfs_node *node) { if (node->tn_accessed || tm->tm_ronly) return; atomic_store_8(&node->tn_accessed, true); } /* Sync timestamps */ void tmpfs_itimes(struct vnode *vp, const struct timespec *acc, const struct timespec *mod) { struct tmpfs_node *node; struct timespec now; ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); node = VP_TO_TMPFS_NODE(vp); if (!node->tn_accessed && (node->tn_status & (TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) return; vfs_timestamp(&now); TMPFS_NODE_LOCK(node); if (node->tn_accessed) { if (acc == NULL) acc = &now; node->tn_atime = *acc; } if (node->tn_status & TMPFS_NODE_MODIFIED) { if (mod == NULL) mod = &now; node->tn_mtime = *mod; } if (node->tn_status & TMPFS_NODE_CHANGED) node->tn_ctime = now; node->tn_status &= ~(TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); node->tn_accessed = false; TMPFS_NODE_UNLOCK(node); /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME); } int tmpfs_truncate(struct vnode *vp, off_t length) { int error; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); if (length < 0) { error = EINVAL; goto out; } if (node->tn_size == length) { error = 0; goto out; } if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) return (EFBIG); error = tmpfs_reg_resize(vp, length, FALSE); if (error == 0) node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; out: tmpfs_update(vp); return (error); } static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) { if (a->td_hash > b->td_hash) return (1); else if (a->td_hash < b->td_hash) return (-1); return (0); } RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c index bdad78f66ea5..8077689caeac 100644 --- a/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -1,1882 +1,1872 @@ /* $NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * tmpfs vnode interface. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_tmpfs); VFS_SMR_DECLARE; static volatile int tmpfs_rename_restarts; SYSCTL_INT(_vfs_tmpfs, OID_AUTO, rename_restarts, CTLFLAG_RD, __DEVOLATILE(int *, &tmpfs_rename_restarts), 0, "Times rename had to restart due to lock contention"); static int tmpfs_vn_get_ino_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (tmpfs_alloc_vp(mp, arg, lkflags, rvp)); } static int tmpfs_lookup1(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct tmpfs_dirent *de; struct tmpfs_node *dnode, *pnode; struct tmpfs_mount *tm; int error; /* Caller assumes responsibility for ensuring access (VEXEC). */ dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULLVP; /* We cannot be requesting the parent directory of the root node. */ MPASS(IMPLIES(dnode->tn_type == VDIR && dnode->tn_dir.tn_parent == dnode, !(cnp->cn_flags & ISDOTDOT))); TMPFS_ASSERT_LOCKED(dnode); if (dnode->tn_dir.tn_parent == NULL) { error = ENOENT; goto out; } if (cnp->cn_flags & ISDOTDOT) { tm = VFS_TO_TMPFS(dvp->v_mount); pnode = dnode->tn_dir.tn_parent; tmpfs_ref_node(pnode); error = vn_vget_ino_gen(dvp, tmpfs_vn_get_ino_alloc, pnode, cnp->cn_lkflags, vpp); tmpfs_free_node(tm, pnode); if (error != 0) goto out; } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { VREF(dvp); *vpp = dvp; error = 0; } else { de = tmpfs_dir_lookup(dnode, NULL, cnp); if (de != NULL && de->td_node == NULL) cnp->cn_flags |= ISWHITEOUT; if (de == NULL || de->td_node == NULL) { /* * The entry was not found in the directory. * This is OK if we are creating or renaming an * entry and are working on the last component of * the path name. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE || \ cnp->cn_nameiop == RENAME || (cnp->cn_nameiop == DELETE && cnp->cn_flags & DOWHITEOUT && cnp->cn_flags & ISWHITEOUT))) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, curthread); if (error != 0) goto out; - /* - * Keep the component name in the buffer for - * future uses. - */ - cnp->cn_flags |= SAVENAME; - error = EJUSTRETURN; } else error = ENOENT; } else { struct tmpfs_node *tnode; /* * The entry was found, so get its associated * tmpfs_node. */ tnode = de->td_node; /* * If we are not at the last path component and * found a non-directory or non-link entry (which * may itself be pointing to a directory), raise * an error. */ if ((tnode->tn_type != VDIR && tnode->tn_type != VLNK) && !(cnp->cn_flags & ISLASTCN)) { error = ENOTDIR; goto out; } /* * If we are deleting or renaming the entry, keep * track of its tmpfs_dirent so that it can be * easily deleted later. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, curthread); if (error != 0) goto out; /* Allocate a new vnode on the matching entry. */ error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); if (error != 0) goto out; if ((dnode->tn_mode & S_ISTXT) && VOP_ACCESS(dvp, VADMIN, cnp->cn_cred, curthread) && VOP_ACCESS(*vpp, VADMIN, cnp->cn_cred, curthread)) { error = EPERM; vput(*vpp); *vpp = NULL; goto out; } - cnp->cn_flags |= SAVENAME; } else { error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); if (error != 0) goto out; } } } /* * Store the result of this lookup in the cache. Avoid this if the * request was for creation, as it does not improve timings on * emprical tests. */ if ((cnp->cn_flags & MAKEENTRY) != 0 && tmpfs_use_nc(dvp)) cache_enter(dvp, *vpp, cnp); out: /* * If there were no errors, *vpp cannot be null and it must be * locked. */ MPASS(IFF(error == 0, *vpp != NULLVP && VOP_ISLOCKED(*vpp))); return (error); } static int tmpfs_cached_lookup(struct vop_cachedlookup_args *v) { return (tmpfs_lookup1(v->a_dvp, v->a_vpp, v->a_cnp)); } static int tmpfs_lookup(struct vop_lookup_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; int error; /* Check accessibility of requested node as a first step. */ error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); return (tmpfs_lookup1(dvp, vpp, cnp)); } static int tmpfs_create(struct vop_create_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; int error; MPASS(vap->va_type == VREG || vap->va_type == VSOCK); error = tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL); if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0 && tmpfs_use_nc(dvp)) cache_enter(dvp, *vpp, cnp); return (error); } static int tmpfs_mknod(struct vop_mknod_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; if (vap->va_type != VBLK && vap->va_type != VCHR && vap->va_type != VFIFO) return (EINVAL); return (tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL)); } struct fileops tmpfs_fnops; static int tmpfs_open(struct vop_open_args *v) { struct vnode *vp; struct tmpfs_node *node; struct file *fp; int error, mode; vp = v->a_vp; mode = v->a_mode; node = VP_TO_TMPFS_NODE(vp); /* * The file is still active but all its names have been removed * (e.g. by a "rmdir $(pwd)"). It cannot be opened any more as * it is about to die. */ if (node->tn_links < 1) return (ENOENT); /* If the file is marked append-only, deny write requests. */ if (node->tn_flags & APPEND && (mode & (FWRITE | O_APPEND)) == FWRITE) error = EPERM; else { error = 0; /* For regular files, the call below is nop. */ KASSERT(vp->v_type != VREG || (node->tn_reg.tn_aobj->flags & OBJ_DEAD) == 0, ("dead object")); vnode_create_vobject(vp, node->tn_size, v->a_td); } fp = v->a_fp; MPASS(fp == NULL || fp->f_data == NULL); if (error == 0 && fp != NULL && vp->v_type == VREG) { tmpfs_ref_node(node); finit_vnode(fp, mode, node, &tmpfs_fnops); } return (error); } static int tmpfs_close(struct vop_close_args *v) { struct vnode *vp = v->a_vp; /* Update node times. */ tmpfs_update(vp); return (0); } int tmpfs_fo_close(struct file *fp, struct thread *td) { struct tmpfs_node *node; node = fp->f_data; if (node != NULL) { MPASS(node->tn_type == VREG); tmpfs_free_node(node->tn_reg.tn_tmp, node); } return (vnops.fo_close(fp, td)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ int tmpfs_fplookup_vexec(struct vop_fplookup_vexec_args *v) { struct vnode *vp; struct tmpfs_node *node; struct ucred *cred; mode_t all_x, mode; vp = v->a_vp; node = VP_TO_TMPFS_NODE_SMR(vp); if (__predict_false(node == NULL)) return (EAGAIN); all_x = S_IXUSR | S_IXGRP | S_IXOTH; mode = atomic_load_short(&node->tn_mode); if (__predict_true((mode & all_x) == all_x)) return (0); cred = v->a_cred; return (vaccess_vexec_smr(mode, node->tn_uid, node->tn_gid, cred)); } int tmpfs_access(struct vop_access_args *v) { struct vnode *vp = v->a_vp; accmode_t accmode = v->a_accmode; struct ucred *cred = v->a_cred; mode_t all_x = S_IXUSR | S_IXGRP | S_IXOTH; int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp)); node = VP_TO_TMPFS_NODE(vp); /* * Common case path lookup. */ if (__predict_true(accmode == VEXEC && (node->tn_mode & all_x) == all_x)) return (0); switch (vp->v_type) { case VDIR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VREG: if (accmode & VWRITE && vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } break; case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VSOCK: /* FALLTHROUGH */ case VFIFO: break; default: error = EINVAL; goto out; } if (accmode & VWRITE && node->tn_flags & IMMUTABLE) { error = EPERM; goto out; } error = vaccess(vp->v_type, node->tn_mode, node->tn_uid, node->tn_gid, accmode, cred); out: MPASS(VOP_ISLOCKED(vp)); return (error); } int tmpfs_stat(struct vop_stat_args *v) { struct vnode *vp = v->a_vp; struct stat *sb = v->a_sb; vm_object_t obj; struct tmpfs_node *node; int error; node = VP_TO_TMPFS_NODE(vp); tmpfs_update_getattr(vp); error = vop_stat_helper_pre(v); if (__predict_false(error)) return (error); sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = node->tn_id; sb->st_mode = node->tn_mode | VTTOIF(vp->v_type); sb->st_nlink = node->tn_links; sb->st_uid = node->tn_uid; sb->st_gid = node->tn_gid; sb->st_rdev = (vp->v_type == VBLK || vp->v_type == VCHR) ? node->tn_rdev : NODEV; sb->st_size = node->tn_size; sb->st_atim.tv_sec = node->tn_atime.tv_sec; sb->st_atim.tv_nsec = node->tn_atime.tv_nsec; sb->st_mtim.tv_sec = node->tn_mtime.tv_sec; sb->st_mtim.tv_nsec = node->tn_mtime.tv_nsec; sb->st_ctim.tv_sec = node->tn_ctime.tv_sec; sb->st_ctim.tv_nsec = node->tn_ctime.tv_nsec; sb->st_birthtim.tv_sec = node->tn_birthtime.tv_sec; sb->st_birthtim.tv_nsec = node->tn_birthtime.tv_nsec; sb->st_blksize = PAGE_SIZE; sb->st_flags = node->tn_flags; sb->st_gen = node->tn_gen; if (vp->v_type == VREG) { obj = node->tn_reg.tn_aobj; sb->st_blocks = (u_quad_t)obj->resident_page_count * PAGE_SIZE; } else sb->st_blocks = node->tn_size; sb->st_blocks /= S_BLKSIZE; return (vop_stat_helper_post(v, error)); } int tmpfs_getattr(struct vop_getattr_args *v) { struct vnode *vp = v->a_vp; struct vattr *vap = v->a_vap; vm_object_t obj; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); tmpfs_update_getattr(vp); vap->va_type = vp->v_type; vap->va_mode = node->tn_mode; vap->va_nlink = node->tn_links; vap->va_uid = node->tn_uid; vap->va_gid = node->tn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = node->tn_id; vap->va_size = node->tn_size; vap->va_blocksize = PAGE_SIZE; vap->va_atime = node->tn_atime; vap->va_mtime = node->tn_mtime; vap->va_ctime = node->tn_ctime; vap->va_birthtime = node->tn_birthtime; vap->va_gen = node->tn_gen; vap->va_flags = node->tn_flags; vap->va_rdev = (vp->v_type == VBLK || vp->v_type == VCHR) ? node->tn_rdev : NODEV; if (vp->v_type == VREG) { obj = node->tn_reg.tn_aobj; vap->va_bytes = (u_quad_t)obj->resident_page_count * PAGE_SIZE; } else vap->va_bytes = node->tn_size; vap->va_filerev = 0; return (0); } int tmpfs_setattr(struct vop_setattr_args *v) { struct vnode *vp = v->a_vp; struct vattr *vap = v->a_vap; struct ucred *cred = v->a_cred; struct thread *td = curthread; int error; MPASS(VOP_ISLOCKED(vp)); ASSERT_VOP_IN_SEQC(vp); error = 0; /* Abort if any unsettable attribute is given. */ if (vap->va_type != VNON || vap->va_nlink != VNOVAL || vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || vap->va_blocksize != VNOVAL || vap->va_gen != VNOVAL || vap->va_rdev != VNOVAL || vap->va_bytes != VNOVAL) error = EINVAL; if (error == 0 && (vap->va_flags != VNOVAL)) error = tmpfs_chflags(vp, vap->va_flags, cred, td); if (error == 0 && (vap->va_size != VNOVAL)) error = tmpfs_chsize(vp, vap->va_size, cred, td); if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL)) error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred, td); if (error == 0 && (vap->va_mode != (mode_t)VNOVAL)) error = tmpfs_chmod(vp, vap->va_mode, cred, td); if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL && vap->va_atime.tv_nsec != VNOVAL) || (vap->va_mtime.tv_sec != VNOVAL && vap->va_mtime.tv_nsec != VNOVAL) || (vap->va_birthtime.tv_sec != VNOVAL && vap->va_birthtime.tv_nsec != VNOVAL))) error = tmpfs_chtimes(vp, vap, cred, td); /* * Update the node times. We give preference to the error codes * generated by this function rather than the ones that may arise * from tmpfs_update. */ tmpfs_update(vp); MPASS(VOP_ISLOCKED(vp)); return (error); } static int tmpfs_read(struct vop_read_args *v) { struct vnode *vp; struct uio *uio; struct tmpfs_node *node; vp = v->a_vp; if (vp->v_type != VREG) return (EISDIR); uio = v->a_uio; if (uio->uio_offset < 0) return (EINVAL); node = VP_TO_TMPFS_NODE(vp); tmpfs_set_accessed(VFS_TO_TMPFS(vp->v_mount), node); return (uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio)); } static int tmpfs_read_pgcache(struct vop_read_pgcache_args *v) { struct vnode *vp; struct tmpfs_node *node; vm_object_t object; off_t size; int error; vp = v->a_vp; VNPASS((vn_irflag_read(vp) & VIRF_PGREAD) != 0, vp); if (v->a_uio->uio_offset < 0) return (EINVAL); error = EJUSTRETURN; vfs_smr_enter(); node = VP_TO_TMPFS_NODE_SMR(vp); if (node == NULL) goto out_smr; MPASS(node->tn_type == VREG); MPASS(node->tn_refcount >= 1); object = node->tn_reg.tn_aobj; if (object == NULL) goto out_smr; MPASS(object->type == tmpfs_pager_type); MPASS((object->flags & (OBJ_ANON | OBJ_DEAD | OBJ_SWAP)) == OBJ_SWAP); if (!VN_IS_DOOMED(vp)) { /* size cannot become shorter due to rangelock. */ size = node->tn_size; tmpfs_set_accessed(node->tn_reg.tn_tmp, node); vfs_smr_exit(); error = uiomove_object(object, size, v->a_uio); return (error); } out_smr: vfs_smr_exit(); return (error); } static int tmpfs_write(struct vop_write_args *v) { struct vnode *vp; struct uio *uio; struct tmpfs_node *node; off_t oldsize; int error, ioflag; mode_t newmode; vp = v->a_vp; uio = v->a_uio; ioflag = v->a_ioflag; error = 0; node = VP_TO_TMPFS_NODE(vp); oldsize = node->tn_size; if (uio->uio_offset < 0 || vp->v_type != VREG) return (EINVAL); if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio->uio_offset = node->tn_size; if (uio->uio_offset + uio->uio_resid > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) return (EFBIG); if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); if (uio->uio_offset + uio->uio_resid > node->tn_size) { error = tmpfs_reg_resize(vp, uio->uio_offset + uio->uio_resid, FALSE); if (error != 0) goto out; } error = uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio); node->tn_status |= TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED; node->tn_accessed = true; if (node->tn_mode & (S_ISUID | S_ISGID)) { if (priv_check_cred(v->a_cred, PRIV_VFS_RETAINSUGID)) { newmode = node->tn_mode & ~(S_ISUID | S_ISGID); vn_seqc_write_begin(vp); atomic_store_short(&node->tn_mode, newmode); vn_seqc_write_end(vp); } } if (error != 0) (void)tmpfs_reg_resize(vp, oldsize, TRUE); out: MPASS(IMPLIES(error == 0, uio->uio_resid == 0)); MPASS(IMPLIES(error != 0, oldsize == node->tn_size)); return (error); } static int tmpfs_deallocate(struct vop_deallocate_args *v) { return (tmpfs_reg_punch_hole(v->a_vp, v->a_offset, v->a_len)); } static int tmpfs_fsync(struct vop_fsync_args *v) { struct vnode *vp = v->a_vp; MPASS(VOP_ISLOCKED(vp)); tmpfs_check_mtime(vp); tmpfs_update(vp); return (0); } static int tmpfs_remove(struct vop_remove_args *v) { struct vnode *dvp = v->a_dvp; struct vnode *vp = v->a_vp; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp)); MPASS(VOP_ISLOCKED(vp)); if (vp->v_type == VDIR) { error = EISDIR; goto out; } dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); de = tmpfs_dir_lookup(dnode, node, v->a_cnp); MPASS(de != NULL); /* Files marked as immutable or append-only cannot be deleted. */ if ((node->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) || (dnode->tn_flags & APPEND)) { error = EPERM; goto out; } /* Remove the entry from the directory; as it is a file, we do not * have to change the number of hard links of the directory. */ tmpfs_dir_detach(dvp, de); if (v->a_cnp->cn_flags & DOWHITEOUT) tmpfs_dir_whiteout_add(dvp, v->a_cnp); /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ tmpfs_free_dirent(tmp, de); node->tn_status |= TMPFS_NODE_CHANGED; node->tn_accessed = true; error = 0; out: return (error); } static int tmpfs_link(struct vop_link_args *v) { struct vnode *dvp = v->a_tdvp; struct vnode *vp = v->a_vp; struct componentname *cnp = v->a_cnp; int error; struct tmpfs_dirent *de; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp)); - MPASS(cnp->cn_flags & HASBUF); MPASS(dvp != vp); /* XXX When can this be false? */ node = VP_TO_TMPFS_NODE(vp); /* Ensure that we do not overflow the maximum number of links imposed * by the system. */ MPASS(node->tn_links <= TMPFS_LINK_MAX); if (node->tn_links == TMPFS_LINK_MAX) { error = EMLINK; goto out; } /* We cannot create links of files marked immutable or append-only. */ if (node->tn_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } /* Allocate a new directory entry to represent the node. */ error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) goto out; /* Insert the new directory entry into the appropriate directory. */ if (cnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(dvp, cnp); tmpfs_dir_attach(dvp, de); /* vp link count has changed, so update node times. */ node->tn_status |= TMPFS_NODE_CHANGED; tmpfs_update(vp); error = 0; out: return (error); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int tmpfs_rename_relock(struct vnode *fdvp, struct vnode **fvpp, struct vnode *tdvp, struct vnode **tvpp, struct componentname *fcnp, struct componentname *tcnp) { struct vnode *nvp; struct mount *mp; struct tmpfs_dirent *de; int error, restarts = 0; VOP_UNLOCK(tdvp); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp); mp = fdvp->v_mount; relock: restarts += 1; error = vn_lock(fdvp, LK_EXCLUSIVE); if (error) goto releout; if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(fdvp); error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto releout; VOP_UNLOCK(tdvp); goto relock; } /* * Re-resolve fvp to be certain it still exists and fetch the * correct vnode. */ de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(fdvp), NULL, fcnp); if (de == NULL) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); if ((fcnp->cn_flags & ISDOTDOT) != 0 || (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.')) error = EINVAL; else error = ENOENT; goto releout; } error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (error != 0) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); if (error != EBUSY) goto releout; error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp); /* * Concurrent rename race. */ if (nvp == tdvp) { vrele(nvp); error = EINVAL; goto releout; } vrele(*fvpp); *fvpp = nvp; goto relock; } vrele(*fvpp); *fvpp = nvp; VOP_UNLOCK(*fvpp); /* * Re-resolve tvp and acquire the vnode lock if present. */ de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(tdvp), NULL, tcnp); /* * If tvp disappeared we just carry on. */ if (de == NULL && *tvpp != NULL) { vrele(*tvpp); *tvpp = NULL; } /* * Get the tvp ino if the lookup succeeded. We may have to restart * if the non-blocking acquire fails. */ if (de != NULL) { nvp = NULL; error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (*tvpp != NULL) vrele(*tvpp); *tvpp = nvp; if (error != 0) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); if (error != EBUSY) goto releout; error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp); /* * fdvp contains fvp, thus tvp (=fdvp) is not empty. */ if (nvp == fdvp) { error = ENOTEMPTY; goto releout; } goto relock; } } tmpfs_rename_restarts += restarts; return (0); releout: vrele(fdvp); vrele(*fvpp); vrele(tdvp); if (*tvpp != NULL) vrele(*tvpp); tmpfs_rename_restarts += restarts; return (error); } static int tmpfs_rename(struct vop_rename_args *v) { struct vnode *fdvp = v->a_fdvp; struct vnode *fvp = v->a_fvp; struct componentname *fcnp = v->a_fcnp; struct vnode *tdvp = v->a_tdvp; struct vnode *tvp = v->a_tvp; struct componentname *tcnp = v->a_tcnp; char *newname; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *fdnode; struct tmpfs_node *fnode; struct tmpfs_node *tnode; struct tmpfs_node *tdnode; int error; bool want_seqc_end; MPASS(VOP_ISLOCKED(tdvp)); MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp))); - MPASS(fcnp->cn_flags & HASBUF); - MPASS(tcnp->cn_flags & HASBUF); want_seqc_end = false; /* * Disallow cross-device renames. * XXX Why isn't this done by the caller? */ if (fvp->v_mount != tdvp->v_mount || (tvp != NULL && fvp->v_mount != tvp->v_mount)) { error = EXDEV; goto out; } /* If source and target are the same file, there is nothing to do. */ if (fvp == tvp) { error = 0; goto out; } /* * If we need to move the directory between entries, lock the * source so that we can safely operate on it. */ if (fdvp != tdvp && fdvp != tvp) { if (vn_lock(fdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { error = tmpfs_rename_relock(fdvp, &fvp, tdvp, &tvp, fcnp, tcnp); if (error != 0) return (error); ASSERT_VOP_ELOCKED(fdvp, "tmpfs_rename: fdvp not locked"); ASSERT_VOP_ELOCKED(tdvp, "tmpfs_rename: tdvp not locked"); if (tvp != NULL) ASSERT_VOP_ELOCKED(tvp, "tmpfs_rename: tvp not locked"); if (fvp == tvp) { error = 0; goto out_locked; } } } if (tvp != NULL) vn_seqc_write_begin(tvp); vn_seqc_write_begin(tdvp); vn_seqc_write_begin(fvp); vn_seqc_write_begin(fdvp); want_seqc_end = true; tmp = VFS_TO_TMPFS(tdvp->v_mount); tdnode = VP_TO_TMPFS_DIR(tdvp); tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp); fdnode = VP_TO_TMPFS_DIR(fdvp); fnode = VP_TO_TMPFS_NODE(fvp); de = tmpfs_dir_lookup(fdnode, fnode, fcnp); /* * Entry can disappear before we lock fdvp, * also avoid manipulating '.' and '..' entries. */ if (de == NULL) { if ((fcnp->cn_flags & ISDOTDOT) != 0 || (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.')) error = EINVAL; else error = ENOENT; goto out_locked; } MPASS(de->td_node == fnode); /* * If re-naming a directory to another preexisting directory * ensure that the target directory is empty so that its * removal causes no side effects. * Kern_rename guarantees the destination to be a directory * if the source is one. */ if (tvp != NULL) { MPASS(tnode != NULL); if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (tdnode->tn_flags & (APPEND | IMMUTABLE))) { error = EPERM; goto out_locked; } if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) { if (tnode->tn_size > 0) { error = ENOTEMPTY; goto out_locked; } } else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) { error = ENOTDIR; goto out_locked; } else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) { error = EISDIR; goto out_locked; } else { MPASS(fnode->tn_type != VDIR && tnode->tn_type != VDIR); } } if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (fdnode->tn_flags & (APPEND | IMMUTABLE))) { error = EPERM; goto out_locked; } /* * Ensure that we have enough memory to hold the new name, if it * has to be changed. */ if (fcnp->cn_namelen != tcnp->cn_namelen || bcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) != 0) { newname = malloc(tcnp->cn_namelen, M_TMPFSNAME, M_WAITOK); } else newname = NULL; /* * If the node is being moved to another directory, we have to do * the move. */ if (fdnode != tdnode) { /* * In case we are moving a directory, we have to adjust its * parent to point to the new parent. */ if (de->td_node->tn_type == VDIR) { struct tmpfs_node *n; /* * Ensure the target directory is not a child of the * directory being moved. Otherwise, we'd end up * with stale nodes. */ n = tdnode; /* * TMPFS_LOCK guaranties that no nodes are freed while * traversing the list. Nodes can only be marked as * removed: tn_parent == NULL. */ TMPFS_LOCK(tmp); TMPFS_NODE_LOCK(n); while (n != n->tn_dir.tn_parent) { struct tmpfs_node *parent; if (n == fnode) { TMPFS_NODE_UNLOCK(n); TMPFS_UNLOCK(tmp); error = EINVAL; if (newname != NULL) free(newname, M_TMPFSNAME); goto out_locked; } parent = n->tn_dir.tn_parent; TMPFS_NODE_UNLOCK(n); if (parent == NULL) { n = NULL; break; } TMPFS_NODE_LOCK(parent); if (parent->tn_dir.tn_parent == NULL) { TMPFS_NODE_UNLOCK(parent); n = NULL; break; } n = parent; } TMPFS_UNLOCK(tmp); if (n == NULL) { error = EINVAL; if (newname != NULL) free(newname, M_TMPFSNAME); goto out_locked; } TMPFS_NODE_UNLOCK(n); /* Adjust the parent pointer. */ TMPFS_VALIDATE_DIR(fnode); TMPFS_NODE_LOCK(de->td_node); de->td_node->tn_dir.tn_parent = tdnode; TMPFS_NODE_UNLOCK(de->td_node); /* * As a result of changing the target of the '..' * entry, the link count of the source and target * directories has to be adjusted. */ TMPFS_NODE_LOCK(tdnode); TMPFS_ASSERT_LOCKED(tdnode); tdnode->tn_links++; TMPFS_NODE_UNLOCK(tdnode); TMPFS_NODE_LOCK(fdnode); TMPFS_ASSERT_LOCKED(fdnode); fdnode->tn_links--; TMPFS_NODE_UNLOCK(fdnode); } } /* * Do the move: just remove the entry from the source directory * and insert it into the target one. */ tmpfs_dir_detach(fdvp, de); if (fcnp->cn_flags & DOWHITEOUT) tmpfs_dir_whiteout_add(fdvp, fcnp); if (tcnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(tdvp, tcnp); /* * If the name has changed, we need to make it effective by changing * it in the directory entry. */ if (newname != NULL) { MPASS(tcnp->cn_namelen <= MAXNAMLEN); free(de->ud.td_name, M_TMPFSNAME); de->ud.td_name = newname; tmpfs_dirent_init(de, tcnp->cn_nameptr, tcnp->cn_namelen); fnode->tn_status |= TMPFS_NODE_CHANGED; tdnode->tn_status |= TMPFS_NODE_MODIFIED; } /* * If we are overwriting an entry, we have to remove the old one * from the target directory. */ if (tvp != NULL) { struct tmpfs_dirent *tde; /* Remove the old entry from the target directory. */ tde = tmpfs_dir_lookup(tdnode, tnode, tcnp); tmpfs_dir_detach(tdvp, tde); /* * Free the directory entry we just deleted. Note that the * node referred by it will not be removed until the vnode is * really reclaimed. */ tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), tde); } tmpfs_dir_attach(tdvp, de); if (tmpfs_use_nc(fvp)) { cache_vop_rename(fdvp, fvp, tdvp, tvp, fcnp, tcnp); } error = 0; out_locked: if (fdvp != tdvp && fdvp != tvp) VOP_UNLOCK(fdvp); out: if (want_seqc_end) { if (tvp != NULL) vn_seqc_write_end(tvp); vn_seqc_write_end(tdvp); vn_seqc_write_end(fvp); vn_seqc_write_end(fdvp); } /* * Release target nodes. * XXX: I don't understand when tdvp can be the same as tvp, but * other code takes care of this... */ if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp != NULL) vput(tvp); /* Release source nodes. */ vrele(fdvp); vrele(fvp); return (error); } static int tmpfs_mkdir(struct vop_mkdir_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; MPASS(vap->va_type == VDIR); return (tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL)); } static int tmpfs_rmdir(struct vop_rmdir_args *v) { struct vnode *dvp = v->a_dvp; struct vnode *vp = v->a_vp; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp)); MPASS(VOP_ISLOCKED(vp)); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_DIR(vp); /* Directories with more than two entries ('.' and '..') cannot be * removed. */ if (node->tn_size > 0) { error = ENOTEMPTY; goto out; } if ((dnode->tn_flags & APPEND) || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* This invariant holds only if we are not trying to remove "..". * We checked for that above so this is safe now. */ MPASS(node->tn_dir.tn_parent == dnode); /* Get the directory entry associated with node (vp). This was * filled by tmpfs_lookup while looking up the entry. */ de = tmpfs_dir_lookup(dnode, node, v->a_cnp); MPASS(TMPFS_DIRENT_MATCHES(de, v->a_cnp->cn_nameptr, v->a_cnp->cn_namelen)); /* Check flags to see if we are allowed to remove the directory. */ if ((dnode->tn_flags & APPEND) != 0 || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) != 0) { error = EPERM; goto out; } /* Detach the directory entry from the directory (dnode). */ tmpfs_dir_detach(dvp, de); if (v->a_cnp->cn_flags & DOWHITEOUT) tmpfs_dir_whiteout_add(dvp, v->a_cnp); /* No vnode should be allocated for this entry from this point */ TMPFS_NODE_LOCK(node); node->tn_links--; node->tn_dir.tn_parent = NULL; node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; node->tn_accessed = true; TMPFS_NODE_UNLOCK(node); TMPFS_NODE_LOCK(dnode); dnode->tn_links--; dnode->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; dnode->tn_accessed = true; TMPFS_NODE_UNLOCK(dnode); if (tmpfs_use_nc(dvp)) { cache_vop_rmdir(dvp, vp); } /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ tmpfs_free_dirent(tmp, de); /* Release the deleted vnode (will destroy the node, notify * interested parties and clean it from the cache). */ dnode->tn_status |= TMPFS_NODE_CHANGED; tmpfs_update(dvp); error = 0; out: return (error); } static int tmpfs_symlink(struct vop_symlink_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; const char *target = v->a_target; #ifdef notyet /* XXX FreeBSD BUG: kern_symlink is not setting VLNK */ MPASS(vap->va_type == VLNK); #else vap->va_type = VLNK; #endif return (tmpfs_alloc_file(dvp, vpp, vap, cnp, target)); } static int tmpfs_readdir(struct vop_readdir_args *va) { struct vnode *vp; struct uio *uio; struct tmpfs_mount *tm; struct tmpfs_node *node; uint64_t **cookies; int *eofflag, *ncookies; ssize_t startresid; int error, maxcookies; vp = va->a_vp; uio = va->a_uio; eofflag = va->a_eofflag; cookies = va->a_cookies; ncookies = va->a_ncookies; /* This operation only makes sense on directory nodes. */ if (vp->v_type != VDIR) return (ENOTDIR); maxcookies = 0; node = VP_TO_TMPFS_DIR(vp); tm = VFS_TO_TMPFS(vp->v_mount); startresid = uio->uio_resid; /* Allocate cookies for NFS and compat modules. */ if (cookies != NULL && ncookies != NULL) { maxcookies = howmany(node->tn_size, sizeof(struct tmpfs_dirent)) + 2; *cookies = malloc(maxcookies * sizeof(**cookies), M_TEMP, M_WAITOK); *ncookies = 0; } if (cookies == NULL) error = tmpfs_dir_getdents(tm, node, uio, 0, NULL, NULL); else error = tmpfs_dir_getdents(tm, node, uio, maxcookies, *cookies, ncookies); /* Buffer was filled without hitting EOF. */ if (error == EJUSTRETURN) error = (uio->uio_resid != startresid) ? 0 : EINVAL; if (error != 0 && cookies != NULL && ncookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } if (eofflag != NULL) *eofflag = (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF); return (error); } static int tmpfs_readlink(struct vop_readlink_args *v) { struct vnode *vp = v->a_vp; struct uio *uio = v->a_uio; int error; struct tmpfs_node *node; MPASS(uio->uio_offset == 0); MPASS(vp->v_type == VLNK); node = VP_TO_TMPFS_NODE(vp); error = uiomove(node->tn_link_target, MIN(node->tn_size, uio->uio_resid), uio); tmpfs_set_accessed(VFS_TO_TMPFS(vp->v_mount), node); return (error); } /* * VOP_FPLOOKUP_SYMLINK routines are subject to special circumstances, see * the comment above cache_fplookup for details. * * Check tmpfs_alloc_node for tmpfs-specific synchronisation notes. */ static int tmpfs_fplookup_symlink(struct vop_fplookup_symlink_args *v) { struct vnode *vp; struct tmpfs_node *node; char *symlink; vp = v->a_vp; node = VP_TO_TMPFS_NODE_SMR(vp); if (__predict_false(node == NULL)) return (EAGAIN); if (!atomic_load_char(&node->tn_link_smr)) return (EAGAIN); symlink = atomic_load_ptr(&node->tn_link_target); if (symlink == NULL) return (EAGAIN); return (cache_symlink_resolve(v->a_fpl, symlink, node->tn_size)); } static int tmpfs_inactive(struct vop_inactive_args *v) { struct vnode *vp; struct tmpfs_node *node; vp = v->a_vp; node = VP_TO_TMPFS_NODE(vp); if (node->tn_links == 0) vrecycle(vp); else tmpfs_check_mtime(vp); return (0); } static int tmpfs_need_inactive(struct vop_need_inactive_args *ap) { struct vnode *vp; struct tmpfs_node *node; struct vm_object *obj; vp = ap->a_vp; node = VP_TO_TMPFS_NODE(vp); if (node->tn_links == 0) goto need; if (vp->v_type == VREG) { obj = vp->v_object; if (obj->generation != obj->cleangeneration) goto need; } return (0); need: return (1); } int tmpfs_reclaim(struct vop_reclaim_args *v) { struct vnode *vp; struct tmpfs_mount *tmp; struct tmpfs_node *node; bool unlock; vp = v->a_vp; node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); if (vp->v_type == VREG) tmpfs_destroy_vobject(vp, node->tn_reg.tn_aobj); vp->v_object = NULL; TMPFS_LOCK(tmp); TMPFS_NODE_LOCK(node); tmpfs_free_vp(vp); /* * If the node referenced by this vnode was deleted by the user, * we must free its associated data structures (now that the vnode * is being reclaimed). */ unlock = true; if (node->tn_links == 0 && (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0) { node->tn_vpstate = TMPFS_VNODE_DOOMED; unlock = !tmpfs_free_node_locked(tmp, node, true); } if (unlock) { TMPFS_NODE_UNLOCK(node); TMPFS_UNLOCK(tmp); } MPASS(vp->v_data == NULL); return (0); } int tmpfs_print(struct vop_print_args *v) { struct vnode *vp = v->a_vp; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); printf("tag VT_TMPFS, tmpfs_node %p, flags 0x%lx, links %jd\n", node, node->tn_flags, (uintmax_t)node->tn_links); printf("\tmode 0%o, owner %d, group %d, size %jd, status 0x%x\n", node->tn_mode, node->tn_uid, node->tn_gid, (intmax_t)node->tn_size, node->tn_status); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } int tmpfs_pathconf(struct vop_pathconf_args *v) { struct vnode *vp = v->a_vp; int name = v->a_name; long *retval = v->a_retval; int error; error = 0; switch (name) { case _PC_LINK_MAX: *retval = TMPFS_LINK_MAX; break; case _PC_SYMLINK_MAX: *retval = MAXPATHLEN; break; case _PC_NAME_MAX: *retval = NAME_MAX; break; case _PC_PIPE_BUF: if (vp->v_type == VDIR || vp->v_type == VFIFO) *retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *retval = 1; break; case _PC_NO_TRUNC: *retval = 1; break; case _PC_SYNC_IO: *retval = 1; break; case _PC_FILESIZEBITS: *retval = 64; break; default: error = vop_stdpathconf(v); } return (error); } static int tmpfs_vptofh(struct vop_vptofh_args *ap) /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ { struct tmpfs_fid_data tfd; struct tmpfs_node *node; struct fid *fhp; node = VP_TO_TMPFS_NODE(ap->a_vp); fhp = ap->a_fhp; fhp->fid_len = sizeof(tfd); /* * Copy into fid_data from the stack to avoid unaligned pointer use. * See the comment in sys/mount.h on struct fid for details. */ tfd.tfd_id = node->tn_id; tfd.tfd_gen = node->tn_gen; memcpy(fhp->fid_data, &tfd, fhp->fid_len); return (0); } static int tmpfs_whiteout(struct vop_whiteout_args *ap) { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct tmpfs_dirent *de; switch (ap->a_flags) { case LOOKUP: return (0); case CREATE: de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); if (de != NULL) return (de->td_node == NULL ? 0 : EEXIST); return (tmpfs_dir_whiteout_add(dvp, cnp)); case DELETE: tmpfs_dir_whiteout_remove(dvp, cnp); return (0); default: panic("tmpfs_whiteout: unknown op"); } } static int tmpfs_vptocnp_dir(struct tmpfs_node *tn, struct tmpfs_node *tnp, struct tmpfs_dirent **pde) { struct tmpfs_dir_cursor dc; struct tmpfs_dirent *de; for (de = tmpfs_dir_first(tnp, &dc); de != NULL; de = tmpfs_dir_next(tnp, &dc)) { if (de->td_node == tn) { *pde = de; return (0); } } return (ENOENT); } static int tmpfs_vptocnp_fill(struct vnode *vp, struct tmpfs_node *tn, struct tmpfs_node *tnp, char *buf, size_t *buflen, struct vnode **dvp) { struct tmpfs_dirent *de; int error, i; error = vn_vget_ino_gen(vp, tmpfs_vn_get_ino_alloc, tnp, LK_SHARED, dvp); if (error != 0) return (error); error = tmpfs_vptocnp_dir(tn, tnp, &de); if (error == 0) { i = *buflen; i -= de->td_namelen; if (i < 0) { error = ENOMEM; } else { bcopy(de->ud.td_name, buf + i, de->td_namelen); *buflen = i; } } if (error == 0) { if (vp != *dvp) VOP_UNLOCK(*dvp); } else { if (vp != *dvp) vput(*dvp); else vrele(vp); } return (error); } static int tmpfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp, **dvp; struct tmpfs_node *tn, *tnp, *tnp1; struct tmpfs_dirent *de; struct tmpfs_mount *tm; char *buf; size_t *buflen; int error; vp = ap->a_vp; dvp = ap->a_vpp; buf = ap->a_buf; buflen = ap->a_buflen; tm = VFS_TO_TMPFS(vp->v_mount); tn = VP_TO_TMPFS_NODE(vp); if (tn->tn_type == VDIR) { tnp = tn->tn_dir.tn_parent; if (tnp == NULL) return (ENOENT); tmpfs_ref_node(tnp); error = tmpfs_vptocnp_fill(vp, tn, tn->tn_dir.tn_parent, buf, buflen, dvp); tmpfs_free_node(tm, tnp); return (error); } restart: TMPFS_LOCK(tm); restart_locked: LIST_FOREACH_SAFE(tnp, &tm->tm_nodes_used, tn_entries, tnp1) { if (tnp->tn_type != VDIR) continue; TMPFS_NODE_LOCK(tnp); tmpfs_ref_node(tnp); /* * tn_vnode cannot be instantiated while we hold the * node lock, so the directory cannot be changed while * we iterate over it. Do this to avoid instantiating * vnode for directories which cannot point to our * node. */ error = tnp->tn_vnode == NULL ? tmpfs_vptocnp_dir(tn, tnp, &de) : 0; if (error == 0) { TMPFS_NODE_UNLOCK(tnp); TMPFS_UNLOCK(tm); error = tmpfs_vptocnp_fill(vp, tn, tnp, buf, buflen, dvp); if (error == 0) { tmpfs_free_node(tm, tnp); return (0); } if (VN_IS_DOOMED(vp)) { tmpfs_free_node(tm, tnp); return (ENOENT); } TMPFS_LOCK(tm); TMPFS_NODE_LOCK(tnp); } if (tmpfs_free_node_locked(tm, tnp, false)) { goto restart; } else { KASSERT(tnp->tn_refcount > 0, ("node %p refcount zero", tnp)); if (tnp->tn_attached) { tnp1 = LIST_NEXT(tnp, tn_entries); TMPFS_NODE_UNLOCK(tnp); } else { TMPFS_NODE_UNLOCK(tnp); goto restart_locked; } } } TMPFS_UNLOCK(tm); return (ENOENT); } /* * Vnode operations vector used for files stored in a tmpfs file system. */ struct vop_vector tmpfs_vnodeop_entries = { .vop_default = &default_vnodeops, .vop_lookup = vfs_cache_lookup, .vop_cachedlookup = tmpfs_cached_lookup, .vop_create = tmpfs_create, .vop_mknod = tmpfs_mknod, .vop_open = tmpfs_open, .vop_close = tmpfs_close, .vop_fplookup_vexec = tmpfs_fplookup_vexec, .vop_fplookup_symlink = tmpfs_fplookup_symlink, .vop_access = tmpfs_access, .vop_stat = tmpfs_stat, .vop_getattr = tmpfs_getattr, .vop_setattr = tmpfs_setattr, .vop_read = tmpfs_read, .vop_read_pgcache = tmpfs_read_pgcache, .vop_write = tmpfs_write, .vop_deallocate = tmpfs_deallocate, .vop_fsync = tmpfs_fsync, .vop_remove = tmpfs_remove, .vop_link = tmpfs_link, .vop_rename = tmpfs_rename, .vop_mkdir = tmpfs_mkdir, .vop_rmdir = tmpfs_rmdir, .vop_symlink = tmpfs_symlink, .vop_readdir = tmpfs_readdir, .vop_readlink = tmpfs_readlink, .vop_inactive = tmpfs_inactive, .vop_need_inactive = tmpfs_need_inactive, .vop_reclaim = tmpfs_reclaim, .vop_print = tmpfs_print, .vop_pathconf = tmpfs_pathconf, .vop_vptofh = tmpfs_vptofh, .vop_whiteout = tmpfs_whiteout, .vop_bmap = VOP_EOPNOTSUPP, .vop_vptocnp = tmpfs_vptocnp, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(tmpfs_vnodeop_entries); /* * Same vector for mounts which do not use namecache. */ struct vop_vector tmpfs_vnodeop_nonc_entries = { .vop_default = &tmpfs_vnodeop_entries, .vop_lookup = tmpfs_lookup, }; VFS_VOP_VECTOR_REGISTER(tmpfs_vnodeop_nonc_entries); diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c index a3a5feef3291..ac00cc1bc093 100644 --- a/sys/fs/unionfs/union_subr.c +++ b/sys/fs/unionfs/union_subr.c @@ -1,1336 +1,1319 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1994 Jan-Simon Pendry * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * Copyright (c) 2005, 2006, 2012 Masanori Ozawa , ONGS Inc. * Copyright (c) 2006, 2012 Daichi Goto * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_subr.c 8.20 (Berkeley) 5/20/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NUNIONFSNODECACHE 16 #define UNIONFSHASHMASK (NUNIONFSNODECACHE - 1) static MALLOC_DEFINE(M_UNIONFSHASH, "UNIONFS hash", "UNIONFS hash table"); MALLOC_DEFINE(M_UNIONFSNODE, "UNIONFS node", "UNIONFS vnode private part"); MALLOC_DEFINE(M_UNIONFSPATH, "UNIONFS path", "UNIONFS path private part"); static struct task unionfs_deferred_rele_task; static struct mtx unionfs_deferred_rele_lock; static STAILQ_HEAD(, unionfs_node) unionfs_deferred_rele_list = STAILQ_HEAD_INITIALIZER(unionfs_deferred_rele_list); static TASKQUEUE_DEFINE_THREAD(unionfs_rele); unsigned int unionfs_ndeferred = 0; SYSCTL_UINT(_vfs, OID_AUTO, unionfs_ndeferred, CTLFLAG_RD, &unionfs_ndeferred, 0, "unionfs deferred vnode release"); static void unionfs_deferred_rele(void *, int); /* * Initialize */ int unionfs_init(struct vfsconf *vfsp) { UNIONFSDEBUG("unionfs_init\n"); /* printed during system boot */ TASK_INIT(&unionfs_deferred_rele_task, 0, unionfs_deferred_rele, NULL); mtx_init(&unionfs_deferred_rele_lock, "uniondefr", NULL, MTX_DEF); return (0); } /* * Uninitialize */ int unionfs_uninit(struct vfsconf *vfsp) { taskqueue_quiesce(taskqueue_unionfs_rele); taskqueue_free(taskqueue_unionfs_rele); mtx_destroy(&unionfs_deferred_rele_lock); return (0); } static void unionfs_deferred_rele(void *arg __unused, int pending __unused) { STAILQ_HEAD(, unionfs_node) local_rele_list; struct unionfs_node *unp, *tunp; unsigned int ndeferred; ndeferred = 0; STAILQ_INIT(&local_rele_list); mtx_lock(&unionfs_deferred_rele_lock); STAILQ_CONCAT(&local_rele_list, &unionfs_deferred_rele_list); mtx_unlock(&unionfs_deferred_rele_lock); STAILQ_FOREACH_SAFE(unp, &local_rele_list, un_rele, tunp) { ++ndeferred; MPASS(unp->un_dvp != NULL); vrele(unp->un_dvp); free(unp, M_UNIONFSNODE); } /* We expect this function to be single-threaded, thus no atomic */ unionfs_ndeferred += ndeferred; } static struct unionfs_node_hashhead * unionfs_get_hashhead(struct vnode *dvp, struct vnode *lookup) { struct unionfs_node *unp; unp = VTOUNIONFS(dvp); return (&(unp->un_hashtbl[vfs_hash_index(lookup) & UNIONFSHASHMASK])); } /* * Attempt to lookup a cached unionfs vnode by upper/lower vp * from dvp, with dvp's interlock held. */ static struct vnode * unionfs_get_cached_vnode_locked(struct vnode *lookup, struct vnode *dvp) { struct unionfs_node *unp; struct unionfs_node_hashhead *hd; struct vnode *vp; hd = unionfs_get_hashhead(dvp, lookup); LIST_FOREACH(unp, hd, un_hash) { if (unp->un_uppervp == lookup || unp->un_lowervp == lookup) { vp = UNIONFSTOV(unp); VI_LOCK_FLAGS(vp, MTX_DUPOK); vp->v_iflag &= ~VI_OWEINACT; if (VN_IS_DOOMED(vp) || ((vp->v_iflag & VI_DOINGINACT) != 0)) { VI_UNLOCK(vp); vp = NULLVP; } else { vrefl(vp); VI_UNLOCK(vp); } return (vp); } } return (NULLVP); } /* * Get the cached vnode. */ static struct vnode * unionfs_get_cached_vnode(struct vnode *uvp, struct vnode *lvp, struct vnode *dvp) { struct vnode *vp; vp = NULLVP; VI_LOCK(dvp); if (uvp != NULLVP) vp = unionfs_get_cached_vnode_locked(uvp, dvp); else if (lvp != NULLVP) vp = unionfs_get_cached_vnode_locked(lvp, dvp); VI_UNLOCK(dvp); return (vp); } /* * Add the new vnode into cache. */ static struct vnode * unionfs_ins_cached_vnode(struct unionfs_node *uncp, struct vnode *dvp) { struct unionfs_node_hashhead *hd; struct vnode *vp; ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__); ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__); KASSERT(uncp->un_uppervp == NULLVP || uncp->un_uppervp->v_type == VDIR, ("%s: v_type != VDIR", __func__)); KASSERT(uncp->un_lowervp == NULLVP || uncp->un_lowervp->v_type == VDIR, ("%s: v_type != VDIR", __func__)); vp = NULLVP; VI_LOCK(dvp); if (uncp->un_uppervp != NULL) vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp); else if (uncp->un_lowervp != NULL) vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp); if (vp == NULLVP) { hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULLVP ? uncp->un_uppervp : uncp->un_lowervp)); LIST_INSERT_HEAD(hd, uncp, un_hash); } VI_UNLOCK(dvp); return (vp); } /* * Remove the vnode. */ static void unionfs_rem_cached_vnode(struct unionfs_node *unp, struct vnode *dvp) { KASSERT(unp != NULL, ("%s: null node", __func__)); KASSERT(dvp != NULLVP, ("%s: null parent vnode", __func__)); VI_LOCK(dvp); if (unp->un_hash.le_prev != NULL) { LIST_REMOVE(unp, un_hash); unp->un_hash.le_next = NULL; unp->un_hash.le_prev = NULL; } VI_UNLOCK(dvp); } /* * Common cleanup handling for unionfs_nodeget * Upper, lower, and parent directory vnodes are expected to be referenced by * the caller. Upper and lower vnodes, if non-NULL, are also expected to be * exclusively locked by the caller. * This function will return with the caller's locks and references undone. */ static void unionfs_nodeget_cleanup(struct vnode *vp, struct unionfs_node *unp) { /* * Lock and reset the default vnode lock; vgone() expects a locked * vnode, and we're going to reset the vnode ops. */ lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); /* * Clear out private data and reset the vnode ops to avoid use of * unionfs vnode ops on a partially constructed vnode. */ VI_LOCK(vp); vp->v_data = NULL; vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; VI_UNLOCK(vp); vgone(vp); vput(vp); if (unp->un_dvp != NULLVP) vrele(unp->un_dvp); if (unp->un_uppervp != NULLVP) vput(unp->un_uppervp); if (unp->un_lowervp != NULLVP) vput(unp->un_lowervp); if (unp->un_hashtbl != NULL) hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK); free(unp->un_path, M_UNIONFSPATH); free(unp, M_UNIONFSNODE); } /* * Make a new or get existing unionfs node. * * uppervp and lowervp should be unlocked. Because if new unionfs vnode is * locked, uppervp or lowervp is locked too. In order to prevent dead lock, * you should not lock plurality simultaneously. */ int unionfs_nodeget(struct mount *mp, struct vnode *uppervp, struct vnode *lowervp, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { char *path; struct unionfs_mount *ump; struct unionfs_node *unp; struct vnode *vp; u_long hashmask; int error; int lkflags; enum vtype vt; error = 0; ump = MOUNTTOUNIONFSMOUNT(mp); lkflags = (cnp ? cnp->cn_lkflags : 0); path = (cnp ? cnp->cn_nameptr : NULL); *vpp = NULLVP; if (uppervp == NULLVP && lowervp == NULLVP) panic("%s: upper and lower is null", __func__); vt = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type); /* If it has no ISLASTCN flag, path check is skipped. */ if (cnp && !(cnp->cn_flags & ISLASTCN)) path = NULL; /* check the cache */ if (dvp != NULLVP && vt == VDIR) { vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp); if (vp != NULLVP) { *vpp = vp; goto unionfs_nodeget_out; } } unp = malloc(sizeof(struct unionfs_node), M_UNIONFSNODE, M_WAITOK | M_ZERO); error = getnewvnode("unionfs", mp, &unionfs_vnodeops, &vp); if (error != 0) { free(unp, M_UNIONFSNODE); return (error); } if (dvp != NULLVP) vref(dvp); if (uppervp != NULLVP) vref(uppervp); if (lowervp != NULLVP) vref(lowervp); if (vt == VDIR) { unp->un_hashtbl = hashinit(NUNIONFSNODECACHE, M_UNIONFSHASH, &hashmask); KASSERT(hashmask == UNIONFSHASHMASK, ("unexpected unionfs hash mask 0x%lx", hashmask)); } unp->un_vnode = vp; unp->un_uppervp = uppervp; unp->un_lowervp = lowervp; unp->un_dvp = dvp; if (uppervp != NULLVP) vp->v_vnlock = uppervp->v_vnlock; else vp->v_vnlock = lowervp->v_vnlock; if (path != NULL) { unp->un_path = malloc(cnp->cn_namelen + 1, M_UNIONFSPATH, M_WAITOK | M_ZERO); bcopy(cnp->cn_nameptr, unp->un_path, cnp->cn_namelen); unp->un_path[cnp->cn_namelen] = '\0'; unp->un_pathlen = cnp->cn_namelen; } vp->v_type = vt; vp->v_data = unp; /* * TODO: This is an imperfect check, as there's no guarantee that * the underlying filesystems will always return vnode pointers * for the root inodes that match our cached values. To reduce * the likelihood of failure, for example in the case where either * vnode has been forcibly doomed, we check both pointers and set * VV_ROOT if either matches. */ if (ump->um_uppervp == uppervp || ump->um_lowervp == lowervp) vp->v_vflag |= VV_ROOT; KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0, ("%s: NULL dvp for non-root vp %p", __func__, vp)); vn_lock_pair(lowervp, false, uppervp, false); error = insmntque1(vp, mp); if (error != 0) { unionfs_nodeget_cleanup(vp, unp); return (error); } if (lowervp != NULL && VN_IS_DOOMED(lowervp)) { vput(lowervp); unp->un_lowervp = NULL; } if (uppervp != NULL && VN_IS_DOOMED(uppervp)) { vput(uppervp); unp->un_uppervp = NULL; } if (unp->un_lowervp == NULL && unp->un_uppervp == NULL) { unionfs_nodeget_cleanup(vp, unp); return (ENOENT); } if (dvp != NULLVP && vt == VDIR) *vpp = unionfs_ins_cached_vnode(unp, dvp); if (*vpp != NULLVP) { unionfs_nodeget_cleanup(vp, unp); vp = *vpp; } else { if (uppervp != NULL) VOP_UNLOCK(uppervp); if (lowervp != NULL) VOP_UNLOCK(lowervp); *vpp = vp; } unionfs_nodeget_out: if (lkflags & LK_TYPE_MASK) vn_lock(vp, lkflags | LK_RETRY); return (0); } /* * Clean up the unionfs node. */ void unionfs_noderem(struct vnode *vp) { struct unionfs_node *unp, *unp_t1, *unp_t2; struct unionfs_node_hashhead *hd; struct unionfs_node_status *unsp, *unsp_tmp; struct vnode *lvp; struct vnode *uvp; struct vnode *dvp; int count; int writerefs; /* * The root vnode lock may be recursed during unmount, because * it may share the same lock as the unionfs mount's covered vnode, * which is locked across VFS_UNMOUNT(). This lock will then be * recursively taken during the vflush() issued by unionfs_unmount(). * But we still only need to lock the unionfs lock once, because only * one of those lock operations was taken against a unionfs vnode and * will be undone against a unionfs vnode. */ KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0, ("%s: vnode %p locked recursively", __func__, vp)); if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("%s: failed to acquire lock for vnode lock", __func__); /* * Use the interlock to protect the clearing of v_data to * prevent faults in unionfs_lock(). */ VI_LOCK(vp); unp = VTOUNIONFS(vp); lvp = unp->un_lowervp; uvp = unp->un_uppervp; dvp = unp->un_dvp; unp->un_lowervp = unp->un_uppervp = NULLVP; vp->v_vnlock = &(vp->v_lock); vp->v_data = NULL; vp->v_object = NULL; if (unp->un_hashtbl != NULL) { /* * Clear out any cached child vnodes. This should only * be necessary during forced unmount, when the vnode may * be reclaimed with a non-zero use count. Otherwise the * reference held by each child should prevent reclamation. */ for (count = 0; count <= UNIONFSHASHMASK; count++) { hd = unp->un_hashtbl + count; LIST_FOREACH_SAFE(unp_t1, hd, un_hash, unp_t2) { LIST_REMOVE(unp_t1, un_hash); unp_t1->un_hash.le_next = NULL; unp_t1->un_hash.le_prev = NULL; } } } VI_UNLOCK(vp); writerefs = atomic_load_int(&vp->v_writecount); VNASSERT(writerefs >= 0, vp, ("%s: write count %d, unexpected text ref", __func__, writerefs)); /* * If we were opened for write, we leased the write reference * to the lower vnode. If this is a reclamation due to the * forced unmount, undo the reference now. */ if (writerefs > 0) { VNASSERT(uvp != NULL, vp, ("%s: write reference without upper vnode", __func__)); VOP_ADD_WRITECOUNT(uvp, -writerefs); } if (lvp != NULLVP) VOP_UNLOCK(lvp); if (uvp != NULLVP) VOP_UNLOCK(uvp); if (dvp != NULLVP) unionfs_rem_cached_vnode(unp, dvp); if (lvp != NULLVP) vrele(lvp); if (uvp != NULLVP) vrele(uvp); if (unp->un_path != NULL) { free(unp->un_path, M_UNIONFSPATH); unp->un_path = NULL; unp->un_pathlen = 0; } if (unp->un_hashtbl != NULL) { hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK); } LIST_FOREACH_SAFE(unsp, &(unp->un_unshead), uns_list, unsp_tmp) { LIST_REMOVE(unsp, uns_list); free(unsp, M_TEMP); } if (dvp != NULLVP) { mtx_lock(&unionfs_deferred_rele_lock); STAILQ_INSERT_TAIL(&unionfs_deferred_rele_list, unp, un_rele); mtx_unlock(&unionfs_deferred_rele_lock); taskqueue_enqueue(taskqueue_unionfs_rele, &unionfs_deferred_rele_task); } else free(unp, M_UNIONFSNODE); } /* * Get the unionfs node status object for the vnode corresponding to unp, * for the process that owns td. Allocate a new status object if one * does not already exist. */ void unionfs_get_node_status(struct unionfs_node *unp, struct thread *td, struct unionfs_node_status **unspp) { struct unionfs_node_status *unsp; pid_t pid; pid = td->td_proc->p_pid; KASSERT(NULL != unspp, ("%s: NULL status", __func__)); ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__); LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) { if (unsp->uns_pid == pid) { *unspp = unsp; return; } } /* create a new unionfs node status */ unsp = malloc(sizeof(struct unionfs_node_status), M_TEMP, M_WAITOK | M_ZERO); unsp->uns_pid = pid; LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list); *unspp = unsp; } /* * Remove the unionfs node status, if you can. * You need exclusive lock this vnode. */ void unionfs_tryrem_node_status(struct unionfs_node *unp, struct unionfs_node_status *unsp) { KASSERT(NULL != unsp, ("%s: NULL status", __func__)); ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__); if (0 < unsp->uns_lower_opencnt || 0 < unsp->uns_upper_opencnt) return; LIST_REMOVE(unsp, uns_list); free(unsp, M_TEMP); } /* * Create upper node attr. */ void unionfs_create_uppervattr_core(struct unionfs_mount *ump, struct vattr *lva, struct vattr *uva, struct thread *td) { VATTR_NULL(uva); uva->va_type = lva->va_type; uva->va_atime = lva->va_atime; uva->va_mtime = lva->va_mtime; uva->va_ctime = lva->va_ctime; switch (ump->um_copymode) { case UNIONFS_TRANSPARENT: uva->va_mode = lva->va_mode; uva->va_uid = lva->va_uid; uva->va_gid = lva->va_gid; break; case UNIONFS_MASQUERADE: if (ump->um_uid == lva->va_uid) { uva->va_mode = lva->va_mode & 077077; uva->va_mode |= (lva->va_type == VDIR ? ump->um_udir : ump->um_ufile) & 0700; uva->va_uid = lva->va_uid; uva->va_gid = lva->va_gid; } else { uva->va_mode = (lva->va_type == VDIR ? ump->um_udir : ump->um_ufile); uva->va_uid = ump->um_uid; uva->va_gid = ump->um_gid; } break; default: /* UNIONFS_TRADITIONAL */ uva->va_mode = 0777 & ~td->td_proc->p_pd->pd_cmask; uva->va_uid = ump->um_uid; uva->va_gid = ump->um_gid; break; } } /* * Create upper node attr. */ int unionfs_create_uppervattr(struct unionfs_mount *ump, struct vnode *lvp, struct vattr *uva, struct ucred *cred, struct thread *td) { struct vattr lva; int error; if ((error = VOP_GETATTR(lvp, &lva, cred))) return (error); unionfs_create_uppervattr_core(ump, &lva, uva, td); return (error); } /* * relookup * * dvp should be locked on entry and will be locked on return. * * If an error is returned, *vpp will be invalid, otherwise it will hold a * locked, referenced vnode. If *vpp == dvp then remember that only one * LK_EXCLUSIVE lock is held. */ int unionfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct componentname *cn, struct thread *td, char *path, int pathlen, u_long nameiop) { int error; cn->cn_namelen = pathlen; cn->cn_pnbuf = path; cn->cn_nameiop = nameiop; - cn->cn_flags = (LOCKPARENT | LOCKLEAF | HASBUF | SAVENAME | ISLASTCN); + cn->cn_flags = (LOCKPARENT | LOCKLEAF | ISLASTCN); cn->cn_lkflags = LK_EXCLUSIVE; cn->cn_cred = cnp->cn_cred; cn->cn_nameptr = cn->cn_pnbuf; if (nameiop == DELETE) cn->cn_flags |= (cnp->cn_flags & (DOWHITEOUT | SAVESTART)); else if (RENAME == nameiop) cn->cn_flags |= (cnp->cn_flags & SAVESTART); else if (nameiop == CREATE) cn->cn_flags |= NOCACHE; vref(dvp); VOP_UNLOCK(dvp); if ((error = vfs_relookup(dvp, vpp, cn))) { vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } else vrele(dvp); - KASSERT((cn->cn_flags & HASBUF) != 0, - ("%s: HASBUF cleared", __func__)); - KASSERT((cn->cn_flags & SAVENAME) != 0, - ("%s: SAVENAME cleared", __func__)); KASSERT(cn->cn_pnbuf == path, ("%s: cn_pnbuf changed", __func__)); return (error); } /* * relookup for CREATE namei operation. * * dvp is unionfs vnode. dvp should be locked. * * If it called 'unionfs_copyfile' function by unionfs_link etc, * VOP_LOOKUP information is broken. * So it need relookup in order to create link etc. */ int unionfs_relookup_for_create(struct vnode *dvp, struct componentname *cnp, struct thread *td) { struct vnode *udvp; struct vnode *vp; struct componentname cn; int error; udvp = UNIONFSVPTOUPPERVP(dvp); vp = NULLVP; - KASSERT((cnp->cn_flags & HASBUF) != 0, - ("%s called without HASBUF", __func__)); error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, cnp->cn_namelen, CREATE); if (error) return (error); if (vp != NULLVP) { if (udvp == vp) vrele(vp); else vput(vp); error = EEXIST; } return (error); } /* * relookup for DELETE namei operation. * * dvp is unionfs vnode. dvp should be locked. */ int unionfs_relookup_for_delete(struct vnode *dvp, struct componentname *cnp, struct thread *td) { struct vnode *udvp; struct vnode *vp; struct componentname cn; int error; udvp = UNIONFSVPTOUPPERVP(dvp); vp = NULLVP; - KASSERT((cnp->cn_flags & HASBUF) != 0, - ("%s called without HASBUF", __func__)); error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, cnp->cn_namelen, DELETE); if (error) return (error); if (vp == NULLVP) error = ENOENT; else { if (udvp == vp) vrele(vp); else vput(vp); } return (error); } /* * relookup for RENAME namei operation. * * dvp is unionfs vnode. dvp should be locked. */ int unionfs_relookup_for_rename(struct vnode *dvp, struct componentname *cnp, struct thread *td) { struct vnode *udvp; struct vnode *vp; struct componentname cn; int error; udvp = UNIONFSVPTOUPPERVP(dvp); vp = NULLVP; - KASSERT((cnp->cn_flags & HASBUF) != 0, - ("%s called without HASBUF", __func__)); error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, cnp->cn_namelen, RENAME); if (error) return (error); if (vp != NULLVP) { if (udvp == vp) vrele(vp); else vput(vp); } return (error); } /* * Update the unionfs_node. * * uvp is new locked upper vnode. unionfs vnode's lock will be exchanged to the * uvp's lock and lower's lock will be unlocked. */ static void unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp, struct thread *td) { struct unionfs_node_hashhead *hd; struct vnode *vp; struct vnode *lvp; struct vnode *dvp; unsigned count, lockrec; vp = UNIONFSTOV(unp); lvp = unp->un_lowervp; ASSERT_VOP_ELOCKED(lvp, __func__); ASSERT_VOP_ELOCKED(uvp, __func__); dvp = unp->un_dvp; VNASSERT(vp->v_writecount == 0, vp, ("%s: non-zero writecount", __func__)); /* * Update the upper vnode's lock state to match the lower vnode, * and then switch the unionfs vnode's lock to the upper vnode. */ lockrec = lvp->v_vnlock->lk_recurse; for (count = 0; count < lockrec; count++) vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY); VI_LOCK(vp); unp->un_uppervp = uvp; vp->v_vnlock = uvp->v_vnlock; VI_UNLOCK(vp); /* * Re-cache the unionfs vnode against the upper vnode */ if (dvp != NULLVP && vp->v_type == VDIR) { VI_LOCK(dvp); if (unp->un_hash.le_prev != NULL) { LIST_REMOVE(unp, un_hash); hd = unionfs_get_hashhead(dvp, uvp); LIST_INSERT_HEAD(hd, unp, un_hash); } VI_UNLOCK(unp->un_dvp); } } /* * Create a new shadow dir. * * udvp should be locked on entry and will be locked on return. * * If no error returned, unp will be updated. */ int unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp, struct unionfs_node *unp, struct componentname *cnp, struct thread *td) { struct vnode *lvp; struct vnode *uvp; struct vattr va; struct vattr lva; struct nameidata nd; struct mount *mp; struct ucred *cred; struct ucred *credbk; struct uidinfo *rootinfo; int error; if (unp->un_uppervp != NULLVP) return (EEXIST); lvp = unp->un_lowervp; uvp = NULLVP; credbk = cnp->cn_cred; /* Authority change to root */ rootinfo = uifind((uid_t)0); cred = crdup(cnp->cn_cred); /* * The calls to chgproccnt() are needed to compensate for change_ruid() * calling chgproccnt(). */ chgproccnt(cred->cr_ruidinfo, 1, 0); change_euid(cred, rootinfo); change_ruid(cred, rootinfo); change_svuid(cred, (uid_t)0); uifree(rootinfo); cnp->cn_cred = cred; memset(&nd.ni_cnd, 0, sizeof(struct componentname)); NDPREINIT(&nd); if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred))) goto unionfs_mkshadowdir_abort; if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td, cnp->cn_nameptr, cnp->cn_namelen, CREATE))) goto unionfs_mkshadowdir_abort; if (uvp != NULLVP) { if (udvp == uvp) vrele(uvp); else vput(uvp); error = EEXIST; goto unionfs_mkshadowdir_abort; } if ((error = vn_start_write(udvp, &mp, V_WAIT | PCATCH))) goto unionfs_mkshadowdir_abort; unionfs_create_uppervattr_core(ump, &lva, &va, td); error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va); if (!error) { unionfs_node_update(unp, uvp, td); /* * XXX The bug which cannot set uid/gid was corrected. * Ignore errors. */ va.va_type = VNON; VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred); } vn_finished_write(mp); unionfs_mkshadowdir_abort: cnp->cn_cred = credbk; chgproccnt(cred->cr_ruidinfo, -1, 0); crfree(cred); return (error); } /* * Create a new whiteout. * * dvp should be locked on entry and will be locked on return. */ int unionfs_mkwhiteout(struct vnode *dvp, struct componentname *cnp, struct thread *td, char *path, int pathlen) { struct vnode *wvp; struct nameidata nd; struct mount *mp; int error; wvp = NULLVP; NDPREINIT(&nd); if ((error = unionfs_relookup(dvp, &wvp, cnp, &nd.ni_cnd, td, path, pathlen, CREATE))) { return (error); } if (wvp != NULLVP) { if (dvp == wvp) vrele(wvp); else vput(wvp); return (EEXIST); } if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH))) goto unionfs_mkwhiteout_free_out; error = VOP_WHITEOUT(dvp, &nd.ni_cnd, CREATE); vn_finished_write(mp); unionfs_mkwhiteout_free_out: return (error); } /* * Create a new vnode for create a new shadow file. * * If an error is returned, *vpp will be invalid, otherwise it will hold a * locked, referenced and opened vnode. * * unp is never updated. */ static int unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp, struct unionfs_node *unp, struct vattr *uvap, struct thread *td) { struct unionfs_mount *ump; struct vnode *vp; struct vnode *lvp; struct ucred *cred; struct vattr lva; struct nameidata nd; int fmode; int error; ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount); vp = NULLVP; lvp = unp->un_lowervp; cred = td->td_ucred; fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL); error = 0; if ((error = VOP_GETATTR(lvp, &lva, cred)) != 0) return (error); unionfs_create_uppervattr_core(ump, &lva, uvap, td); if (unp->un_path == NULL) panic("%s: NULL un_path", __func__); nd.ni_cnd.cn_namelen = unp->un_pathlen; nd.ni_cnd.cn_pnbuf = unp->un_path; nd.ni_cnd.cn_nameiop = CREATE; - nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | HASBUF | SAVENAME | - ISLASTCN; + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | ISLASTCN; nd.ni_cnd.cn_lkflags = LK_EXCLUSIVE; nd.ni_cnd.cn_cred = cred; nd.ni_cnd.cn_nameptr = nd.ni_cnd.cn_pnbuf; NDPREINIT(&nd); vref(udvp); if ((error = vfs_relookup(udvp, &vp, &nd.ni_cnd)) != 0) goto unionfs_vn_create_on_upper_free_out2; vrele(udvp); if (vp != NULLVP) { if (vp == udvp) vrele(vp); else vput(vp); error = EEXIST; goto unionfs_vn_create_on_upper_free_out1; } if ((error = VOP_CREATE(udvp, &vp, &nd.ni_cnd, uvap)) != 0) goto unionfs_vn_create_on_upper_free_out1; if ((error = VOP_OPEN(vp, fmode, cred, td, NULL)) != 0) { vput(vp); goto unionfs_vn_create_on_upper_free_out1; } error = VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); if (error == 0) { *vpp = vp; } else { VOP_CLOSE(vp, fmode, cred, td); } unionfs_vn_create_on_upper_free_out1: VOP_UNLOCK(udvp); unionfs_vn_create_on_upper_free_out2: - KASSERT((nd.ni_cnd.cn_flags & HASBUF) != 0, - ("%s: HASBUF cleared", __func__)); - KASSERT((nd.ni_cnd.cn_flags & SAVENAME) != 0, - ("%s: SAVENAME cleared", __func__)); KASSERT(nd.ni_cnd.cn_pnbuf == unp->un_path, ("%s: cn_pnbuf changed", __func__)); return (error); } /* * Copy from lvp to uvp. * * lvp and uvp should be locked and opened on entry and will be locked and * opened on return. */ static int unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp, struct ucred *cred, struct thread *td) { char *buf; struct uio uio; struct iovec iov; off_t offset; int count; int error; int bufoffset; error = 0; memset(&uio, 0, sizeof(uio)); uio.uio_td = td; uio.uio_segflg = UIO_SYSSPACE; uio.uio_offset = 0; buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); while (error == 0) { offset = uio.uio_offset; uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = buf; iov.iov_len = MAXBSIZE; uio.uio_resid = iov.iov_len; uio.uio_rw = UIO_READ; if ((error = VOP_READ(lvp, &uio, 0, cred)) != 0) break; if ((count = MAXBSIZE - uio.uio_resid) == 0) break; bufoffset = 0; while (bufoffset < count) { uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = buf + bufoffset; iov.iov_len = count - bufoffset; uio.uio_offset = offset + bufoffset; uio.uio_resid = iov.iov_len; uio.uio_rw = UIO_WRITE; if ((error = VOP_WRITE(uvp, &uio, 0, cred)) != 0) break; bufoffset += (count - bufoffset) - uio.uio_resid; } uio.uio_offset = offset + bufoffset; } free(buf, M_TEMP); return (error); } /* * Copy file from lower to upper. * * If you need copy of the contents, set 1 to docopy. Otherwise, set 0 to * docopy. * * If no error returned, unp will be updated. */ int unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred, struct thread *td) { struct mount *mp; struct vnode *udvp; struct vnode *lvp; struct vnode *uvp; struct vattr uva; int error; lvp = unp->un_lowervp; uvp = NULLVP; if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (unp->un_dvp == NULLVP) return (EINVAL); if (unp->un_uppervp != NULLVP) return (EEXIST); udvp = VTOUNIONFS(unp->un_dvp)->un_uppervp; if (udvp == NULLVP) return (EROFS); if ((udvp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); error = VOP_ACCESS(lvp, VREAD, cred, td); if (error != 0) return (error); if ((error = vn_start_write(udvp, &mp, V_WAIT | PCATCH)) != 0) return (error); error = unionfs_vn_create_on_upper(&uvp, udvp, unp, &uva, td); if (error != 0) { vn_finished_write(mp); return (error); } if (docopy != 0) { error = VOP_OPEN(lvp, FREAD, cred, td, NULL); if (error == 0) { error = unionfs_copyfile_core(lvp, uvp, cred, td); VOP_CLOSE(lvp, FREAD, cred, td); } } VOP_CLOSE(uvp, FWRITE, cred, td); VOP_ADD_WRITECOUNT_CHECKED(uvp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, uvp, uvp->v_writecount); vn_finished_write(mp); if (error == 0) { /* Reset the attributes. Ignore errors. */ uva.va_type = VNON; VOP_SETATTR(uvp, &uva, cred); } unionfs_node_update(unp, uvp, td); return (error); } /* * It checks whether vp can rmdir. (check empty) * * vp is unionfs vnode. * vp should be locked. */ int unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td) { struct vnode *uvp; struct vnode *lvp; struct vnode *tvp; struct dirent *dp; struct dirent *edp; struct componentname cn; struct iovec iov; struct uio uio; struct vattr va; int error; int eofflag; int lookuperr; /* * The size of buf needs to be larger than DIRBLKSIZ. */ char buf[256 * 6]; ASSERT_VOP_ELOCKED(vp, __func__); eofflag = 0; uvp = UNIONFSVPTOUPPERVP(vp); lvp = UNIONFSVPTOLOWERVP(vp); /* check opaque */ if ((error = VOP_GETATTR(uvp, &va, cred)) != 0) return (error); if (va.va_flags & OPAQUE) return (0); /* open vnode */ #ifdef MAC if ((error = mac_vnode_check_open(cred, vp, VEXEC|VREAD)) != 0) return (error); #endif if ((error = VOP_ACCESS(vp, VEXEC|VREAD, cred, td)) != 0) return (error); if ((error = VOP_OPEN(vp, FREAD, cred, td, NULL)) != 0) return (error); uio.uio_rw = UIO_READ; uio.uio_segflg = UIO_SYSSPACE; uio.uio_td = td; uio.uio_offset = 0; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, lvp); #endif while (!error && !eofflag) { iov.iov_base = buf; iov.iov_len = sizeof(buf); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = iov.iov_len; error = VOP_READDIR(lvp, &uio, cred, &eofflag, NULL, NULL); if (error != 0) break; KASSERT(eofflag != 0 || uio.uio_resid < sizeof(buf), ("%s: empty read from lower FS", __func__)); edp = (struct dirent*)&buf[sizeof(buf) - uio.uio_resid]; for (dp = (struct dirent*)buf; !error && dp < edp; dp = (struct dirent*)((caddr_t)dp + dp->d_reclen)) { if (dp->d_type == DT_WHT || dp->d_fileno == 0 || (dp->d_namlen == 1 && dp->d_name[0] == '.') || (dp->d_namlen == 2 && !bcmp(dp->d_name, "..", 2))) continue; cn.cn_namelen = dp->d_namlen; cn.cn_pnbuf = NULL; cn.cn_nameptr = dp->d_name; cn.cn_nameiop = LOOKUP; - cn.cn_flags = LOCKPARENT | LOCKLEAF | SAVENAME | - RDONLY | ISLASTCN; + cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN; cn.cn_lkflags = LK_EXCLUSIVE; cn.cn_cred = cred; /* * check entry in lower. * Sometimes, readdir function returns * wrong entry. */ lookuperr = VOP_LOOKUP(lvp, &tvp, &cn); if (!lookuperr) vput(tvp); else continue; /* skip entry */ /* * check entry * If it has no exist/whiteout entry in upper, * directory is not empty. */ - cn.cn_flags = LOCKPARENT | LOCKLEAF | SAVENAME | - RDONLY | ISLASTCN; + cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN; lookuperr = VOP_LOOKUP(uvp, &tvp, &cn); if (!lookuperr) vput(tvp); /* ignore exist or whiteout entry */ if (!lookuperr || (lookuperr == ENOENT && (cn.cn_flags & ISWHITEOUT))) continue; error = ENOTEMPTY; } } /* close vnode */ VOP_CLOSE(vp, FREAD, cred, td); return (error); } diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c index 8403805a99d4..04d788959892 100644 --- a/sys/fs/unionfs/union_vnops.c +++ b/sys/fs/unionfs/union_vnops.c @@ -1,2829 +1,2820 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry. * Copyright (c) 1992, 1993, 1994, 1995 * The Regents of the University of California. * Copyright (c) 2005, 2006, 2012 Masanori Ozawa , ONGS Inc. * Copyright (c) 2006, 2012 Daichi Goto * All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_vnops.c 8.32 (Berkeley) 6/23/95 * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if 0 #define UNIONFS_INTERNAL_DEBUG(msg, args...) printf(msg, ## args) #define UNIONFS_IDBG_RENAME #else #define UNIONFS_INTERNAL_DEBUG(msg, args...) #endif #define KASSERT_UNIONFS_VNODE(vp) \ VNASSERT(((vp)->v_op == &unionfs_vnodeops), vp, \ ("%s: non-unionfs vnode", __func__)) static int unionfs_lookup(struct vop_cachedlookup_args *ap) { struct unionfs_node *dunp; struct vnode *dvp, *udvp, *ldvp, *vp, *uvp, *lvp, *dtmpvp; struct vattr va; struct componentname *cnp; struct thread *td; u_long nameiop; u_long cnflags, cnflagsbk; int iswhiteout; int lockflag; int error , uerror, lerror; iswhiteout = 0; lockflag = 0; error = uerror = lerror = ENOENT; cnp = ap->a_cnp; nameiop = cnp->cn_nameiop; cnflags = cnp->cn_flags; dvp = ap->a_dvp; dunp = VTOUNIONFS(dvp); udvp = dunp->un_uppervp; ldvp = dunp->un_lowervp; vp = uvp = lvp = NULLVP; td = curthread; *(ap->a_vpp) = NULLVP; UNIONFS_INTERNAL_DEBUG( "unionfs_lookup: enter: nameiop=%ld, flags=%lx, path=%s\n", nameiop, cnflags, cnp->cn_nameptr); if (dvp->v_type != VDIR) return (ENOTDIR); /* * If read-only and op is not LOOKUP, will return EROFS. */ if ((cnflags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && LOOKUP != nameiop) return (EROFS); /* * lookup dotdot */ if (cnflags & ISDOTDOT) { if (LOOKUP != nameiop && udvp == NULLVP) return (EROFS); if (udvp != NULLVP) { dtmpvp = udvp; if (ldvp != NULLVP) VOP_UNLOCK(ldvp); } else dtmpvp = ldvp; error = VOP_LOOKUP(dtmpvp, &vp, cnp); if (dtmpvp == udvp && ldvp != NULLVP) { VOP_UNLOCK(udvp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } if (error == 0) { /* * Exchange lock and reference from vp to * dunp->un_dvp. vp is upper/lower vnode, but it * will need to return the unionfs vnode. */ if (nameiop == DELETE || nameiop == RENAME || (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(vp); vrele(vp); VOP_UNLOCK(dvp); *(ap->a_vpp) = dunp->un_dvp; vref(dunp->un_dvp); if (nameiop == DELETE || nameiop == RENAME) vn_lock(dunp->un_dvp, LK_EXCLUSIVE | LK_RETRY); else if (cnp->cn_lkflags & LK_TYPE_MASK) vn_lock(dunp->un_dvp, cnp->cn_lkflags | LK_RETRY); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } else if (error == ENOENT && (cnflags & MAKEENTRY) != 0) cache_enter(dvp, NULLVP, cnp); goto unionfs_lookup_return; } /* * lookup upper layer */ if (udvp != NULLVP) { uerror = VOP_LOOKUP(udvp, &uvp, cnp); if (uerror == 0) { if (udvp == uvp) { /* is dot */ vrele(uvp); *(ap->a_vpp) = dvp; vref(dvp); error = uerror; goto unionfs_lookup_return; } if (nameiop == DELETE || nameiop == RENAME || (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(uvp); } /* check whiteout */ if (uerror == ENOENT || uerror == EJUSTRETURN) if (cnp->cn_flags & ISWHITEOUT) iswhiteout = 1; /* don't lookup lower */ if (iswhiteout == 0 && ldvp != NULLVP) if (!VOP_GETATTR(udvp, &va, cnp->cn_cred) && (va.va_flags & OPAQUE)) iswhiteout = 1; /* don't lookup lower */ #if 0 UNIONFS_INTERNAL_DEBUG( "unionfs_lookup: debug: whiteout=%d, path=%s\n", iswhiteout, cnp->cn_nameptr); #endif } /* * lookup lower layer */ if (ldvp != NULLVP && !(cnflags & DOWHITEOUT) && iswhiteout == 0) { /* always op is LOOKUP */ cnp->cn_nameiop = LOOKUP; cnflagsbk = cnp->cn_flags; cnp->cn_flags = cnflags; lerror = VOP_LOOKUP(ldvp, &lvp, cnp); cnp->cn_nameiop = nameiop; if (udvp != NULLVP && (uerror == 0 || uerror == EJUSTRETURN)) cnp->cn_flags = cnflagsbk; if (lerror == 0) { if (ldvp == lvp) { /* is dot */ if (uvp != NULLVP) vrele(uvp); /* no need? */ vrele(lvp); *(ap->a_vpp) = dvp; vref(dvp); UNIONFS_INTERNAL_DEBUG( "unionfs_lookup: leave (%d)\n", lerror); return (lerror); } if (cnp->cn_lkflags & LK_TYPE_MASK) VOP_UNLOCK(lvp); } } /* * check lookup result */ if (uvp == NULLVP && lvp == NULLVP) { error = (udvp != NULLVP ? uerror : lerror); goto unionfs_lookup_return; } /* * check vnode type */ if (uvp != NULLVP && lvp != NULLVP && uvp->v_type != lvp->v_type) { vrele(lvp); lvp = NULLVP; } /* * check shadow dir */ if (uerror != 0 && uerror != EJUSTRETURN && udvp != NULLVP && lerror == 0 && lvp != NULLVP && lvp->v_type == VDIR && !(dvp->v_mount->mnt_flag & MNT_RDONLY) && (1 < cnp->cn_namelen || '.' != *(cnp->cn_nameptr))) { /* get unionfs vnode in order to create a new shadow dir. */ error = unionfs_nodeget(dvp->v_mount, NULLVP, lvp, dvp, &vp, cnp); if (error != 0) goto unionfs_lookup_cleanup; if (LK_SHARED == (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(vp); if (LK_EXCLUSIVE != VOP_ISLOCKED(vp)) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); lockflag = 1; } error = unionfs_mkshadowdir(MOUNTTOUNIONFSMOUNT(dvp->v_mount), udvp, VTOUNIONFS(vp), cnp, td); if (lockflag != 0) VOP_UNLOCK(vp); if (error != 0) { UNIONFSDEBUG( "unionfs_lookup: Unable to create shadow dir."); if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) vput(vp); else vrele(vp); goto unionfs_lookup_cleanup; } if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_SHARED) vn_lock(vp, LK_SHARED | LK_RETRY); } /* * get unionfs vnode. */ else { if (uvp != NULLVP) error = uerror; else error = lerror; if (error != 0) goto unionfs_lookup_cleanup; /* * get socket vnode. */ if (uvp != NULLVP && uvp->v_type == VSOCK) { vp = uvp; vref(vp); if (cnp->cn_lkflags & LK_TYPE_MASK) vn_lock(vp, cnp->cn_lkflags | LK_RETRY); } else if (lvp != NULLVP && lvp->v_type == VSOCK) { vp = lvp; vref(vp); if (cnp->cn_lkflags & LK_TYPE_MASK) vn_lock(vp, cnp->cn_lkflags | LK_RETRY); } /* * get unionfs vnode. */ else error = unionfs_nodeget(dvp->v_mount, uvp, lvp, dvp, &vp, cnp); if (error != 0) { UNIONFSDEBUG( "unionfs_lookup: Unable to create unionfs vnode."); goto unionfs_lookup_cleanup; } if ((nameiop == DELETE || nameiop == RENAME) && (cnp->cn_lkflags & LK_TYPE_MASK) == 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } *(ap->a_vpp) = vp; if ((cnflags & MAKEENTRY) && vp->v_type != VSOCK) cache_enter(dvp, vp, cnp); unionfs_lookup_cleanup: if (uvp != NULLVP) vrele(uvp); if (lvp != NULLVP) vrele(lvp); if (error == ENOENT && (cnflags & MAKEENTRY) != 0) cache_enter(dvp, NULLVP, cnp); unionfs_lookup_return: - /* Ensure subsequent vnops will get a valid pathname buffer. */ - if (nameiop != LOOKUP && (error == 0 || error == EJUSTRETURN)) - cnp->cn_flags |= SAVENAME; - UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error); return (error); } static int unionfs_create(struct vop_create_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_create: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; udvp = dunp->un_uppervp; error = EROFS; if (udvp != NULLVP) { error = VOP_CREATE(udvp, &vp, cnp, ap->a_vap); if (error != 0) goto unionfs_create_abort; if (vp->v_type == VSOCK) *(ap->a_vpp) = vp; else { VOP_UNLOCK(vp); error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP, ap->a_dvp, ap->a_vpp, cnp); vrele(vp); } } unionfs_create_abort: UNIONFS_INTERNAL_DEBUG("unionfs_create: leave (%d)\n", error); return (error); } static int unionfs_whiteout(struct vop_whiteout_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; udvp = dunp->un_uppervp; error = EOPNOTSUPP; if (udvp != NULLVP) { switch (ap->a_flags) { case CREATE: case DELETE: case LOOKUP: error = VOP_WHITEOUT(udvp, cnp, ap->a_flags); break; default: error = EINVAL; break; } } UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: leave (%d)\n", error); return (error); } static int unionfs_mknod(struct vop_mknod_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_mknod: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; udvp = dunp->un_uppervp; error = EROFS; if (udvp != NULLVP) { error = VOP_MKNOD(udvp, &vp, cnp, ap->a_vap); if (error != 0) goto unionfs_mknod_abort; if (vp->v_type == VSOCK) *(ap->a_vpp) = vp; else { VOP_UNLOCK(vp); error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP, ap->a_dvp, ap->a_vpp, cnp); vrele(vp); } } unionfs_mknod_abort: UNIONFS_INTERNAL_DEBUG("unionfs_mknod: leave (%d)\n", error); return (error); } enum unionfs_lkupgrade { UNIONFS_LKUPGRADE_SUCCESS, /* lock successfully upgraded */ UNIONFS_LKUPGRADE_ALREADY, /* lock already held exclusive */ UNIONFS_LKUPGRADE_DOOMED /* lock was upgraded, but vnode reclaimed */ }; static inline enum unionfs_lkupgrade unionfs_upgrade_lock(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) return (UNIONFS_LKUPGRADE_ALREADY); if (vn_lock(vp, LK_UPGRADE) != 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(vp)) return (UNIONFS_LKUPGRADE_DOOMED); } return (UNIONFS_LKUPGRADE_SUCCESS); } static inline void unionfs_downgrade_lock(struct vnode *vp, enum unionfs_lkupgrade status) { if (status != UNIONFS_LKUPGRADE_ALREADY) vn_lock(vp, LK_DOWNGRADE | LK_RETRY); } static int unionfs_open(struct vop_open_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *vp; struct vnode *uvp; struct vnode *lvp; struct vnode *targetvp; struct ucred *cred; struct thread *td; int error; enum unionfs_lkupgrade lkstatus; UNIONFS_INTERNAL_DEBUG("unionfs_open: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; vp = ap->a_vp; targetvp = NULLVP; cred = ap->a_cred; td = ap->a_td; /* * The executable loader path may call this function with vp locked * shared. If the vnode is reclaimed while upgrading, we can't safely * use unp or do anything else unionfs- specific. */ lkstatus = unionfs_upgrade_lock(vp); if (lkstatus == UNIONFS_LKUPGRADE_DOOMED) { error = ENOENT; goto unionfs_open_cleanup; } unp = VTOUNIONFS(vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt > 0 || unsp->uns_upper_opencnt > 0) { /* vnode is already opend. */ if (unsp->uns_upper_opencnt > 0) targetvp = uvp; else targetvp = lvp; if (targetvp == lvp && (ap->a_mode & FWRITE) && lvp->v_type == VREG) targetvp = NULLVP; } if (targetvp == NULLVP) { if (uvp == NULLVP) { if ((ap->a_mode & FWRITE) && lvp->v_type == VREG) { error = unionfs_copyfile(unp, !(ap->a_mode & O_TRUNC), cred, td); if (error != 0) goto unionfs_open_abort; targetvp = uvp = unp->un_uppervp; } else targetvp = lvp; } else targetvp = uvp; } error = VOP_OPEN(targetvp, ap->a_mode, cred, td, ap->a_fp); if (error == 0) { if (targetvp == uvp) { if (uvp->v_type == VDIR && lvp != NULLVP && unsp->uns_lower_opencnt <= 0) { /* open lower for readdir */ error = VOP_OPEN(lvp, FREAD, cred, td, NULL); if (error != 0) { VOP_CLOSE(uvp, ap->a_mode, cred, td); goto unionfs_open_abort; } unsp->uns_node_flag |= UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt++; } unsp->uns_upper_opencnt++; } else { unsp->uns_lower_opencnt++; unsp->uns_lower_openmode = ap->a_mode; } vp->v_object = targetvp->v_object; } unionfs_open_abort: if (error != 0) unionfs_tryrem_node_status(unp, unsp); unionfs_open_cleanup: unionfs_downgrade_lock(vp, lkstatus); UNIONFS_INTERNAL_DEBUG("unionfs_open: leave (%d)\n", error); return (error); } static int unionfs_close(struct vop_close_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct ucred *cred; struct thread *td; struct vnode *vp; struct vnode *ovp; int error; enum unionfs_lkupgrade lkstatus;; UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); vp = ap->a_vp; cred = ap->a_cred; td = ap->a_td; error = 0; /* * If the vnode is reclaimed while upgrading, we can't safely use unp * or do anything else unionfs- specific. */ lkstatus = unionfs_upgrade_lock(vp); if (lkstatus == UNIONFS_LKUPGRADE_DOOMED) goto unionfs_close_cleanup; unp = VTOUNIONFS(vp); unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0) { #ifdef DIAGNOSTIC printf("unionfs_close: warning: open count is 0\n"); #endif if (unp->un_uppervp != NULLVP) ovp = unp->un_uppervp; else ovp = unp->un_lowervp; } else if (unsp->uns_upper_opencnt > 0) ovp = unp->un_uppervp; else ovp = unp->un_lowervp; error = VOP_CLOSE(ovp, ap->a_fflag, cred, td); if (error != 0) goto unionfs_close_abort; vp->v_object = ovp->v_object; if (ovp == unp->un_uppervp) { unsp->uns_upper_opencnt--; if (unsp->uns_upper_opencnt == 0) { if (unsp->uns_node_flag & UNS_OPENL_4_READDIR) { VOP_CLOSE(unp->un_lowervp, FREAD, cred, td); unsp->uns_node_flag &= ~UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt--; } if (unsp->uns_lower_opencnt > 0) vp->v_object = unp->un_lowervp->v_object; } } else unsp->uns_lower_opencnt--; unionfs_close_abort: unionfs_tryrem_node_status(unp, unsp); unionfs_close_cleanup: unionfs_downgrade_lock(vp, lkstatus); UNIONFS_INTERNAL_DEBUG("unionfs_close: leave (%d)\n", error); return (error); } /* * Check the access mode toward shadow file/dir. */ static int unionfs_check_corrected_access(accmode_t accmode, struct vattr *va, struct ucred *cred) { uid_t uid; /* upper side vnode's uid */ gid_t gid; /* upper side vnode's gid */ u_short vmode; /* upper side vnode's mode */ u_short mask; mask = 0; uid = va->va_uid; gid = va->va_gid; vmode = va->va_mode; /* check owner */ if (cred->cr_uid == uid) { if (accmode & VEXEC) mask |= S_IXUSR; if (accmode & VREAD) mask |= S_IRUSR; if (accmode & VWRITE) mask |= S_IWUSR; return ((vmode & mask) == mask ? 0 : EACCES); } /* check group */ if (groupmember(gid, cred)) { if (accmode & VEXEC) mask |= S_IXGRP; if (accmode & VREAD) mask |= S_IRGRP; if (accmode & VWRITE) mask |= S_IWGRP; return ((vmode & mask) == mask ? 0 : EACCES); } /* check other */ if (accmode & VEXEC) mask |= S_IXOTH; if (accmode & VREAD) mask |= S_IROTH; if (accmode & VWRITE) mask |= S_IWOTH; return ((vmode & mask) == mask ? 0 : EACCES); } static int unionfs_access(struct vop_access_args *ap) { struct unionfs_mount *ump; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; accmode_t accmode; int error; UNIONFS_INTERNAL_DEBUG("unionfs_access: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; accmode = ap->a_accmode; error = EACCES; if ((accmode & VWRITE) && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (ap->a_vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } if (uvp != NULLVP) { error = VOP_ACCESS(uvp, accmode, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error); return (error); } if (lvp != NULLVP) { if (accmode & VWRITE) { if (ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY) { switch (ap->a_vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } else if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) { /* check shadow file/dir */ if (ump->um_copymode != UNIONFS_TRANSPARENT) { error = unionfs_create_uppervattr(ump, lvp, &va, ap->a_cred, td); if (error != 0) return (error); error = unionfs_check_corrected_access( accmode, &va, ap->a_cred); if (error != 0) return (error); } } accmode &= ~(VWRITE | VAPPEND); accmode |= VREAD; /* will copy to upper */ } error = VOP_ACCESS(lvp, accmode, ap->a_cred, td); } UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error); return (error); } static int unionfs_getattr(struct vop_getattr_args *ap) { struct unionfs_node *unp; struct unionfs_mount *ump; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; int error; UNIONFS_INTERNAL_DEBUG("unionfs_getattr: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = curthread; if (uvp != NULLVP) { if ((error = VOP_GETATTR(uvp, ap->a_vap, ap->a_cred)) == 0) ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; UNIONFS_INTERNAL_DEBUG( "unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n", ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error); return (error); } error = VOP_GETATTR(lvp, ap->a_vap, ap->a_cred); if (error == 0 && !(ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY)) { /* correct the attr toward shadow file/dir. */ if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) { unionfs_create_uppervattr_core(ump, ap->a_vap, &va, td); ap->a_vap->va_mode = va.va_mode; ap->a_vap->va_uid = va.va_uid; ap->a_vap->va_gid = va.va_gid; } } if (error == 0) ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; UNIONFS_INTERNAL_DEBUG( "unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n", ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error); return (error); } static int unionfs_setattr(struct vop_setattr_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr *vap; int error; UNIONFS_INTERNAL_DEBUG("unionfs_setattr: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = curthread; vap = ap->a_vap; if ((ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) && (vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL)) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { error = unionfs_copyfile(unp, (vap->va_size != 0), ap->a_cred, td); if (error != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETATTR(uvp, vap, ap->a_cred); UNIONFS_INTERNAL_DEBUG("unionfs_setattr: leave (%d)\n", error); return (error); } static int unionfs_read(struct vop_read_args *ap) { struct unionfs_node *unp; struct vnode *tvp; int error; /* UNIONFS_INTERNAL_DEBUG("unionfs_read: enter\n"); */ KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_READ(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred); /* UNIONFS_INTERNAL_DEBUG("unionfs_read: leave (%d)\n", error); */ return (error); } static int unionfs_write(struct vop_write_args *ap) { struct unionfs_node *unp; struct vnode *tvp; int error; /* UNIONFS_INTERNAL_DEBUG("unionfs_write: enter\n"); */ KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_WRITE(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred); /* UNIONFS_INTERNAL_DEBUG("unionfs_write: leave (%d)\n", error); */ return (error); } static int unionfs_ioctl(struct vop_ioctl_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); unionfs_tryrem_node_status(unp, unsp); VOP_UNLOCK(ap->a_vp); if (ovp == NULLVP) return (EBADF); error = VOP_IOCTL(ovp, ap->a_command, ap->a_data, ap->a_fflag, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: leave (%d)\n", error); return (error); } static int unionfs_poll(struct vop_poll_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; KASSERT_UNIONFS_VNODE(ap->a_vp); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); unionfs_tryrem_node_status(unp, unsp); VOP_UNLOCK(ap->a_vp); if (ovp == NULLVP) return (EBADF); return (VOP_POLL(ovp, ap->a_events, ap->a_cred, ap->a_td)); } static int unionfs_fsync(struct vop_fsync_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); unionfs_tryrem_node_status(unp, unsp); if (ovp == NULLVP) return (EBADF); return (VOP_FSYNC(ovp, ap->a_waitfor, ap->a_td)); } static int unionfs_remove(struct vop_remove_args *ap) { char *path; struct unionfs_node *dunp; struct unionfs_node *unp; struct unionfs_mount *ump; struct vnode *udvp; struct vnode *uvp; struct vnode *lvp; struct vnode *vp; struct componentname *cnp; struct componentname cn; struct thread *td; int error; int pathlen; UNIONFS_INTERNAL_DEBUG("unionfs_remove: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); error = 0; dunp = VTOUNIONFS(ap->a_dvp); udvp = dunp->un_uppervp; cnp = ap->a_cnp; td = curthread; if (ap->a_vp->v_op != &unionfs_vnodeops) { if (ap->a_vp->v_type != VSOCK) return (EINVAL); ump = NULL; vp = uvp = lvp = NULLVP; /* search vnode */ VOP_UNLOCK(ap->a_vp); error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr, cnp->cn_namelen, DELETE); if (error != 0 && error != ENOENT) { vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); return (error); } if (error == 0 && vp == ap->a_vp) { /* target vnode in upper */ uvp = vp; vrele(vp); } else { /* target vnode in lower */ if (vp != NULLVP) { if (udvp == vp) vrele(vp); else vput(vp); } vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); lvp = ap->a_vp; } path = cnp->cn_nameptr; pathlen = cnp->cn_namelen; } else { ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; path = unp->un_path; pathlen = unp->un_pathlen; } if (udvp == NULLVP) return (EROFS); if (uvp != NULLVP) { /* * XXX: if the vnode type is VSOCK, it will create whiteout * after remove. */ if (ump == NULL || ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP) cnp->cn_flags |= DOWHITEOUT; error = VOP_REMOVE(udvp, uvp, cnp); } else if (lvp != NULLVP) error = unionfs_mkwhiteout(udvp, cnp, td, path, pathlen); UNIONFS_INTERNAL_DEBUG("unionfs_remove: leave (%d)\n", error); return (error); } static int unionfs_link(struct vop_link_args *ap) { struct unionfs_node *dunp; struct unionfs_node *unp; struct vnode *udvp; struct vnode *uvp; struct componentname *cnp; struct thread *td; int error; int needrelookup; UNIONFS_INTERNAL_DEBUG("unionfs_link: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_tdvp); KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; needrelookup = 0; dunp = VTOUNIONFS(ap->a_tdvp); unp = NULL; udvp = dunp->un_uppervp; uvp = NULLVP; cnp = ap->a_cnp; td = curthread; if (udvp == NULLVP) return (EROFS); if (ap->a_vp->v_op != &unionfs_vnodeops) uvp = ap->a_vp; else { unp = VTOUNIONFS(ap->a_vp); if (unp->un_uppervp == NULLVP) { if (ap->a_vp->v_type != VREG) return (EOPNOTSUPP); error = unionfs_copyfile(unp, 1, cnp->cn_cred, td); if (error != 0) return (error); needrelookup = 1; } uvp = unp->un_uppervp; } if (needrelookup != 0) error = unionfs_relookup_for_create(ap->a_tdvp, cnp, td); if (error == 0) error = VOP_LINK(udvp, uvp, cnp); UNIONFS_INTERNAL_DEBUG("unionfs_link: leave (%d)\n", error); return (error); } static int unionfs_rename(struct vop_rename_args *ap) { struct vnode *fdvp; struct vnode *fvp; struct componentname *fcnp; struct vnode *tdvp; struct vnode *tvp; struct componentname *tcnp; struct vnode *ltdvp; struct vnode *ltvp; struct thread *td; /* rename target vnodes */ struct vnode *rfdvp; struct vnode *rfvp; struct vnode *rtdvp; struct vnode *rtvp; struct unionfs_mount *ump; struct unionfs_node *unp; int error; int needrelookup; UNIONFS_INTERNAL_DEBUG("unionfs_rename: enter\n"); error = 0; fdvp = ap->a_fdvp; fvp = ap->a_fvp; fcnp = ap->a_fcnp; tdvp = ap->a_tdvp; tvp = ap->a_tvp; tcnp = ap->a_tcnp; ltdvp = NULLVP; ltvp = NULLVP; td = curthread; rfdvp = fdvp; rfvp = fvp; rtdvp = tdvp; rtvp = tvp; needrelookup = 0; -#ifdef DIAGNOSTIC - if (!(fcnp->cn_flags & HASBUF) || !(tcnp->cn_flags & HASBUF)) - panic("unionfs_rename: no name"); -#endif - /* check for cross device rename */ if (fvp->v_mount != tdvp->v_mount || (tvp != NULLVP && fvp->v_mount != tvp->v_mount)) { if (fvp->v_op != &unionfs_vnodeops) error = ENODEV; else error = EXDEV; goto unionfs_rename_abort; } /* Renaming a file to itself has no effect. */ if (fvp == tvp) goto unionfs_rename_abort; /* * from/to vnode is unionfs node. */ KASSERT_UNIONFS_VNODE(fdvp); KASSERT_UNIONFS_VNODE(fvp); KASSERT_UNIONFS_VNODE(tdvp); if (tvp != NULLVP) KASSERT_UNIONFS_VNODE(tvp); unp = VTOUNIONFS(fdvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("fdvp=%p, ufdvp=%p, lfdvp=%p\n", fdvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) { error = ENODEV; goto unionfs_rename_abort; } rfdvp = unp->un_uppervp; vref(rfdvp); unp = VTOUNIONFS(fvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("fvp=%p, ufvp=%p, lfvp=%p\n", fvp, unp->un_uppervp, unp->un_lowervp); #endif ump = MOUNTTOUNIONFSMOUNT(fvp->v_mount); if (unp->un_uppervp == NULLVP) { switch (fvp->v_type) { case VREG: if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto unionfs_rename_abort; error = unionfs_copyfile(unp, 1, fcnp->cn_cred, td); VOP_UNLOCK(fvp); if (error != 0) goto unionfs_rename_abort; break; case VDIR: if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto unionfs_rename_abort; error = unionfs_mkshadowdir(ump, rfdvp, unp, fcnp, td); VOP_UNLOCK(fvp); if (error != 0) goto unionfs_rename_abort; break; default: error = ENODEV; goto unionfs_rename_abort; } needrelookup = 1; } if (unp->un_lowervp != NULLVP) fcnp->cn_flags |= DOWHITEOUT; rfvp = unp->un_uppervp; vref(rfvp); unp = VTOUNIONFS(tdvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("tdvp=%p, utdvp=%p, ltdvp=%p\n", tdvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) { error = ENODEV; goto unionfs_rename_abort; } rtdvp = unp->un_uppervp; ltdvp = unp->un_lowervp; vref(rtdvp); if (tdvp == tvp) { rtvp = rtdvp; vref(rtvp); } else if (tvp != NULLVP) { unp = VTOUNIONFS(tvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("tvp=%p, utvp=%p, ltvp=%p\n", tvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) rtvp = NULLVP; else { if (tvp->v_type == VDIR) { error = EINVAL; goto unionfs_rename_abort; } rtvp = unp->un_uppervp; ltvp = unp->un_lowervp; vref(rtvp); } } if (rfvp == rtvp) goto unionfs_rename_abort; if (needrelookup != 0) { if ((error = vn_lock(fdvp, LK_EXCLUSIVE)) != 0) goto unionfs_rename_abort; error = unionfs_relookup_for_delete(fdvp, fcnp, td); VOP_UNLOCK(fdvp); if (error != 0) goto unionfs_rename_abort; /* Lock of tvp is canceled in order to avoid recursive lock. */ if (tvp != NULLVP && tvp != tdvp) VOP_UNLOCK(tvp); error = unionfs_relookup_for_rename(tdvp, tcnp, td); if (tvp != NULLVP && tvp != tdvp) vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto unionfs_rename_abort; } error = VOP_RENAME(rfdvp, rfvp, fcnp, rtdvp, rtvp, tcnp); if (error == 0) { if (rtvp != NULLVP && rtvp->v_type == VDIR) cache_purge(tdvp); if (fvp->v_type == VDIR && fdvp != tdvp) cache_purge(fdvp); } if (ltdvp != NULLVP) VOP_UNLOCK(ltdvp); if (tdvp != rtdvp) vrele(tdvp); if (ltvp != NULLVP) VOP_UNLOCK(ltvp); if (tvp != rtvp && tvp != NULLVP) { if (rtvp == NULLVP) vput(tvp); else vrele(tvp); } if (fdvp != rfdvp) vrele(fdvp); if (fvp != rfvp) vrele(fvp); UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error); return (error); unionfs_rename_abort: vput(tdvp); if (tdvp != rtdvp) vrele(rtdvp); if (tvp != NULLVP) { if (tdvp != tvp) vput(tvp); else vrele(tvp); } if (tvp != rtvp && rtvp != NULLVP) vrele(rtvp); if (fdvp != rfdvp) vrele(rfdvp); if (fvp != rfvp) vrele(rfvp); vrele(fdvp); vrele(fvp); UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error); return (error); } static int unionfs_mkdir(struct vop_mkdir_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; struct vnode *uvp; struct vattr va; int error; int lkflags; UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); error = EROFS; dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; lkflags = cnp->cn_lkflags; udvp = dunp->un_uppervp; if (udvp != NULLVP) { /* check opaque */ if (!(cnp->cn_flags & ISWHITEOUT)) { error = VOP_GETATTR(udvp, &va, cnp->cn_cred); if (error != 0) return (error); if ((va.va_flags & OPAQUE) != 0) cnp->cn_flags |= ISWHITEOUT; } if ((error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap)) == 0) { VOP_UNLOCK(uvp); cnp->cn_lkflags = LK_EXCLUSIVE; error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP, ap->a_dvp, ap->a_vpp, cnp); cnp->cn_lkflags = lkflags; vrele(uvp); } } UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: leave (%d)\n", error); return (error); } static int unionfs_rmdir(struct vop_rmdir_args *ap) { struct unionfs_node *dunp; struct unionfs_node *unp; struct unionfs_mount *ump; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *uvp; struct vnode *lvp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; dunp = VTOUNIONFS(ap->a_dvp); unp = VTOUNIONFS(ap->a_vp); cnp = ap->a_cnp; td = curthread; udvp = dunp->un_uppervp; uvp = unp->un_uppervp; lvp = unp->un_lowervp; if (udvp == NULLVP) return (EROFS); if (udvp == uvp) return (EOPNOTSUPP); if (uvp != NULLVP) { if (lvp != NULLVP) { error = unionfs_check_rmdir(ap->a_vp, cnp->cn_cred, td); if (error != 0) return (error); } ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP) cnp->cn_flags |= DOWHITEOUT; /* * The relookup path will need to relock the parent dvp and * possibly the vp as well. Locking is expected to be done * in parent->child order; drop the lock on vp to avoid LOR * and potential recursion on vp's lock. * vp is expected to remain referenced during VOP_RMDIR(), * so vref/vrele should not be necessary here. */ VOP_UNLOCK(ap->a_vp); VNPASS(vrefcnt(ap->a_vp) > 0, ap->a_vp); error = unionfs_relookup_for_delete(ap->a_dvp, cnp, td); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); /* * VOP_RMDIR is dispatched against udvp, so if uvp became * doomed while the lock was dropped above the target * filesystem may not be able to cope. */ if (error == 0 && VN_IS_DOOMED(uvp)) error = ENOENT; if (error == 0) error = VOP_RMDIR(udvp, uvp, cnp); } else if (lvp != NULLVP) error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path, unp->un_pathlen); if (error == 0) { cache_purge(ap->a_dvp); cache_purge(ap->a_vp); } UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: leave (%d)\n", error); return (error); } static int unionfs_symlink(struct vop_symlink_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; struct vnode *uvp; int error; int lkflags; UNIONFS_INTERNAL_DEBUG("unionfs_symlink: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); error = EROFS; dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; lkflags = cnp->cn_lkflags; udvp = dunp->un_uppervp; if (udvp != NULLVP) { error = VOP_SYMLINK(udvp, &uvp, cnp, ap->a_vap, ap->a_target); if (error == 0) { VOP_UNLOCK(uvp); cnp->cn_lkflags = LK_EXCLUSIVE; error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP, ap->a_dvp, ap->a_vpp, cnp); cnp->cn_lkflags = lkflags; vrele(uvp); } } UNIONFS_INTERNAL_DEBUG("unionfs_symlink: leave (%d)\n", error); return (error); } static int unionfs_readdir(struct vop_readdir_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct uio *uio; struct vnode *vp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; uint64_t *cookies_bk; int error; int eofflag; int ncookies_bk; int uio_offset_bk; enum unionfs_lkupgrade lkstatus; UNIONFS_INTERNAL_DEBUG("unionfs_readdir: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; eofflag = 0; uio_offset_bk = 0; uio = ap->a_uio; uvp = NULLVP; lvp = NULLVP; td = uio->uio_td; ncookies_bk = 0; cookies_bk = NULL; vp = ap->a_vp; if (vp->v_type != VDIR) return (ENOTDIR); /* * If the vnode is reclaimed while upgrading, we can't safely use unp * or do anything else unionfs- specific. */ lkstatus = unionfs_upgrade_lock(vp); if (lkstatus == UNIONFS_LKUPGRADE_DOOMED) error = EBADF; if (error == 0) { unp = VTOUNIONFS(vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; /* check the open count. unionfs needs open before readdir. */ unionfs_get_node_status(unp, td, &unsp); if ((uvp != NULLVP && unsp->uns_upper_opencnt <= 0) || (lvp != NULLVP && unsp->uns_lower_opencnt <= 0)) { unionfs_tryrem_node_status(unp, unsp); error = EBADF; } } unionfs_downgrade_lock(vp, lkstatus); if (error != 0) goto unionfs_readdir_exit; /* check opaque */ if (uvp != NULLVP && lvp != NULLVP) { if ((error = VOP_GETATTR(uvp, &va, ap->a_cred)) != 0) goto unionfs_readdir_exit; if (va.va_flags & OPAQUE) lvp = NULLVP; } /* upper only */ if (uvp != NULLVP && lvp == NULLVP) { error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); unsp->uns_readdir_status = 0; goto unionfs_readdir_exit; } /* lower only */ if (uvp == NULLVP && lvp != NULLVP) { error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); unsp->uns_readdir_status = 2; goto unionfs_readdir_exit; } /* * readdir upper and lower */ KASSERT(uvp != NULLVP, ("unionfs_readdir: null upper vp")); KASSERT(lvp != NULLVP, ("unionfs_readdir: null lower vp")); if (uio->uio_offset == 0) unsp->uns_readdir_status = 0; if (unsp->uns_readdir_status == 0) { /* read upper */ error = VOP_READDIR(uvp, uio, ap->a_cred, &eofflag, ap->a_ncookies, ap->a_cookies); if (error != 0 || eofflag == 0) goto unionfs_readdir_exit; unsp->uns_readdir_status = 1; /* * UFS(and other FS) needs size of uio_resid larger than * DIRBLKSIZ. * size of DIRBLKSIZ equals DEV_BSIZE. * (see: ufs/ufs/ufs_vnops.c ufs_readdir func , ufs/ufs/dir.h) */ if (uio->uio_resid <= (uio->uio_resid & (DEV_BSIZE -1))) goto unionfs_readdir_exit; /* * Backup cookies. * It prepares to readdir in lower. */ if (ap->a_ncookies != NULL) { ncookies_bk = *(ap->a_ncookies); *(ap->a_ncookies) = 0; } if (ap->a_cookies != NULL) { cookies_bk = *(ap->a_cookies); *(ap->a_cookies) = NULL; } } /* initialize for readdir in lower */ if (unsp->uns_readdir_status == 1) { unsp->uns_readdir_status = 2; /* * Backup uio_offset. See the comment after the * VOP_READDIR call on the lower layer. */ uio_offset_bk = uio->uio_offset; uio->uio_offset = 0; } if (lvp == NULLVP) { error = EBADF; goto unionfs_readdir_exit; } /* read lower */ error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); /* * We can't return an uio_offset of 0: this would trigger an * infinite loop, because the next call to unionfs_readdir would * always restart with the upper layer (uio_offset == 0) and * always return some data. * * This happens when the lower layer root directory is removed. * (A root directory deleting of unionfs should not be permitted. * But current VFS can not do it.) */ if (uio->uio_offset == 0) uio->uio_offset = uio_offset_bk; if (cookies_bk != NULL) { /* merge cookies */ int size; uint64_t *newcookies, *pos; size = *(ap->a_ncookies) + ncookies_bk; newcookies = (uint64_t *) malloc(size * sizeof(*newcookies), M_TEMP, M_WAITOK); pos = newcookies; memcpy(pos, cookies_bk, ncookies_bk * sizeof(*newcookies)); pos += ncookies_bk; memcpy(pos, *(ap->a_cookies), *(ap->a_ncookies) * sizeof(*newcookies)); free(cookies_bk, M_TEMP); free(*(ap->a_cookies), M_TEMP); *(ap->a_ncookies) = size; *(ap->a_cookies) = newcookies; } unionfs_readdir_exit: if (error != 0 && ap->a_eofflag != NULL) *(ap->a_eofflag) = 1; UNIONFS_INTERNAL_DEBUG("unionfs_readdir: leave (%d)\n", error); return (error); } static int unionfs_readlink(struct vop_readlink_args *ap) { struct unionfs_node *unp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_readlink: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_READLINK(vp, ap->a_uio, ap->a_cred); UNIONFS_INTERNAL_DEBUG("unionfs_readlink: leave (%d)\n", error); return (error); } static int unionfs_getwritemount(struct vop_getwritemount_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *vp, *ovp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: enter\n"); error = 0; vp = ap->a_vp; uvp = NULLVP; VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp != NULL) uvp = unp->un_uppervp; /* * If our node has no upper vnode, check the parent directory. * We may be initiating a write operation that will produce a * new upper vnode through CoW. */ if (uvp == NULLVP && unp != NULL) { ovp = vp; vp = unp->un_dvp; /* * Only the root vnode should have an empty parent, but it * should not have an empty uppervp, so we shouldn't get here. */ VNASSERT(vp != NULL, ovp, ("%s: NULL parent vnode", __func__)); VI_UNLOCK(ovp); VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp != NULL) uvp = unp->un_uppervp; if (uvp == NULLVP) error = EACCES; } if (uvp != NULLVP) { vholdnz(uvp); VI_UNLOCK(vp); error = VOP_GETWRITEMOUNT(uvp, ap->a_mpp); vdrop(uvp); } else { VI_UNLOCK(vp); *(ap->a_mpp) = NULL; } UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: leave (%d)\n", error); return (error); } static int unionfs_inactive(struct vop_inactive_args *ap) { ap->a_vp->v_object = NULL; vrecycle(ap->a_vp); return (0); } static int unionfs_reclaim(struct vop_reclaim_args *ap) { /* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: enter\n"); */ unionfs_noderem(ap->a_vp); /* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: leave\n"); */ return (0); } static int unionfs_print(struct vop_print_args *ap) { struct unionfs_node *unp; /* struct unionfs_node_status *unsp; */ unp = VTOUNIONFS(ap->a_vp); /* unionfs_get_node_status(unp, curthread, &unsp); */ printf("unionfs_vp=%p, uppervp=%p, lowervp=%p\n", ap->a_vp, unp->un_uppervp, unp->un_lowervp); /* printf("unionfs opencnt: uppervp=%d, lowervp=%d\n", unsp->uns_upper_opencnt, unsp->uns_lower_opencnt); */ if (unp->un_uppervp != NULLVP) vn_printf(unp->un_uppervp, "unionfs: upper "); if (unp->un_lowervp != NULLVP) vn_printf(unp->un_lowervp, "unionfs: lower "); return (0); } static int unionfs_islocked(struct vop_islocked_args *ap) { struct unionfs_node *unp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); if (unp == NULL) return (vop_stdislocked(ap)); if (unp->un_uppervp != NULLVP) return (VOP_ISLOCKED(unp->un_uppervp)); if (unp->un_lowervp != NULLVP) return (VOP_ISLOCKED(unp->un_lowervp)); return (vop_stdislocked(ap)); } static int unionfs_get_llt_revlock(struct vnode *vp, int flags) { int revlock; revlock = 0; switch (flags & LK_TYPE_MASK) { case LK_SHARED: if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) revlock = LK_UPGRADE; else revlock = LK_RELEASE; break; case LK_EXCLUSIVE: case LK_UPGRADE: revlock = LK_RELEASE; break; case LK_DOWNGRADE: revlock = LK_UPGRADE; break; default: break; } return (revlock); } /* * The state of an acquired lock is adjusted similarly to * the time of error generating. * flags: LK_RELEASE or LK_UPGRADE */ static void unionfs_revlock(struct vnode *vp, int flags) { if (flags & LK_RELEASE) VOP_UNLOCK_FLAGS(vp, flags); else { /* UPGRADE */ if (vn_lock(vp, flags) != 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } } static int unionfs_lock(struct vop_lock1_args *ap) { struct unionfs_node *unp; struct vnode *vp; struct vnode *uvp; struct vnode *lvp; int error; int flags; int revlock; int interlock; int uhold; /* * TODO: rework the unionfs locking scheme. * It's not guaranteed to be safe to blindly lock two vnodes on * different mounts as is done here. Further, the entanglement * of locking both vnodes with the various options that can be * passed to VOP_LOCK() makes this code hard to reason about. * Instead, consider locking only the upper vnode, or the lower * vnode is the upper is not present, and taking separate measures * to lock both vnodes in the few cases when that is needed. */ error = 0; interlock = 1; uhold = 0; flags = ap->a_flags; vp = ap->a_vp; if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK)) return (VOP_UNLOCK_FLAGS(vp, flags | LK_RELEASE)); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp == NULL) goto unionfs_lock_null_vnode; KASSERT_UNIONFS_VNODE(ap->a_vp); lvp = unp->un_lowervp; uvp = unp->un_uppervp; if ((revlock = unionfs_get_llt_revlock(vp, flags)) == 0) panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK); /* * During unmount, the root vnode lock may be taken recursively, * because it may share the same v_vnlock field as the vnode covered by * the unionfs mount. The covered vnode is locked across VFS_UNMOUNT(), * and the same lock may be taken recursively here during vflush() * issued by unionfs_unmount(). */ if ((flags & LK_TYPE_MASK) == LK_EXCLUSIVE && (vp->v_vflag & VV_ROOT) != 0) flags |= LK_CANRECURSE; if (lvp != NULLVP) { if (uvp != NULLVP && flags & LK_UPGRADE) { /* * Share Lock is once released and a deadlock is * avoided. */ vholdnz(uvp); uhold = 1; VOP_UNLOCK(uvp); unp = VTOUNIONFS(vp); if (unp == NULL) { /* vnode is released. */ VI_UNLOCK(vp); VOP_UNLOCK(lvp); vdrop(uvp); return (EBUSY); } } VI_LOCK_FLAGS(lvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(lvp); VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_LOCK(lvp, flags); VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp == NULL) { /* vnode is released. */ VI_UNLOCK(vp); if (error == 0) VOP_UNLOCK(lvp); vdrop(lvp); if (uhold != 0) vdrop(uvp); return (vop_stdlock(ap)); } } if (error == 0 && uvp != NULLVP) { if (uhold && flags & LK_UPGRADE) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } VI_LOCK_FLAGS(uvp, MTX_DUPOK); flags |= LK_INTERLOCK; if (uhold == 0) { vholdl(uvp); uhold = 1; } VI_UNLOCK(vp); ap->a_flags &= ~LK_INTERLOCK; error = VOP_LOCK(uvp, flags); VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp == NULL) { /* vnode is released. */ VI_UNLOCK(vp); if (error == 0) VOP_UNLOCK(uvp); vdrop(uvp); if (lvp != NULLVP) { VOP_UNLOCK(lvp); vdrop(lvp); } return (vop_stdlock(ap)); } if (error != 0 && lvp != NULLVP) { /* rollback */ VI_UNLOCK(vp); unionfs_revlock(lvp, revlock); interlock = 0; } } if (interlock) VI_UNLOCK(vp); if (lvp != NULLVP) vdrop(lvp); if (uhold != 0) vdrop(uvp); return (error); unionfs_lock_null_vnode: ap->a_flags |= LK_INTERLOCK; return (vop_stdlock(ap)); } static int unionfs_unlock(struct vop_unlock_args *ap) { struct vnode *vp; struct vnode *lvp; struct vnode *uvp; struct unionfs_node *unp; int error; int uhold; KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; uhold = 0; vp = ap->a_vp; unp = VTOUNIONFS(vp); if (unp == NULL) goto unionfs_unlock_null_vnode; lvp = unp->un_lowervp; uvp = unp->un_uppervp; if (lvp != NULLVP) { vholdnz(lvp); error = VOP_UNLOCK(lvp); } if (error == 0 && uvp != NULLVP) { vholdnz(uvp); uhold = 1; error = VOP_UNLOCK(uvp); } if (lvp != NULLVP) vdrop(lvp); if (uhold != 0) vdrop(uvp); return error; unionfs_unlock_null_vnode: return (vop_stdunlock(ap)); } static int unionfs_pathconf(struct vop_pathconf_args *ap) { struct unionfs_node *unp; struct vnode *vp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); return (VOP_PATHCONF(vp, ap->a_name, ap->a_retval)); } static int unionfs_advlock(struct vop_advlock_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *vp; struct vnode *uvp; struct thread *td; int error; UNIONFS_INTERNAL_DEBUG("unionfs_advlock: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); vp = ap->a_vp; td = curthread; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; if (uvp == NULLVP) { error = unionfs_copyfile(unp, 1, td->td_ucred, td); if (error != 0) goto unionfs_advlock_abort; uvp = unp->un_uppervp; unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt > 0) { /* try reopen the vnode */ error = VOP_OPEN(uvp, unsp->uns_lower_openmode, td->td_ucred, td, NULL); if (error) goto unionfs_advlock_abort; unsp->uns_upper_opencnt++; VOP_CLOSE(unp->un_lowervp, unsp->uns_lower_openmode, td->td_ucred, td); unsp->uns_lower_opencnt--; } else unionfs_tryrem_node_status(unp, unsp); } VOP_UNLOCK(vp); error = VOP_ADVLOCK(uvp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags); UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error); return error; unionfs_advlock_abort: VOP_UNLOCK(vp); UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error); return error; } static int unionfs_strategy(struct vop_strategy_args *ap) { struct unionfs_node *unp; struct vnode *vp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); #ifdef DIAGNOSTIC if (vp == NULLVP) panic("unionfs_strategy: nullvp"); if (ap->a_bp->b_iocmd == BIO_WRITE && vp == unp->un_lowervp) panic("unionfs_strategy: writing to lowervp"); #endif return (VOP_STRATEGY(vp, ap->a_bp)); } static int unionfs_getacl(struct vop_getacl_args *ap) { struct unionfs_node *unp; struct vnode *vp; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); UNIONFS_INTERNAL_DEBUG("unionfs_getacl: enter\n"); error = VOP_GETACL(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_getacl: leave (%d)\n", error); return (error); } static int unionfs_setacl(struct vop_setacl_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; int error; UNIONFS_INTERNAL_DEBUG("unionfs_setacl: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETACL(uvp, ap->a_type, ap->a_aclp, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_setacl: leave (%d)\n", error); return (error); } static int unionfs_aclcheck(struct vop_aclcheck_args *ap) { struct unionfs_node *unp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_ACLCHECK(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: leave (%d)\n", error); return (error); } static int unionfs_openextattr(struct vop_openextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; struct vnode *tvp; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); vp = ap->a_vp; unp = VTOUNIONFS(vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); if ((tvp == unp->un_uppervp && (unp->un_flag & UNIONFS_OPENEXTU)) || (tvp == unp->un_lowervp && (unp->un_flag & UNIONFS_OPENEXTL))) return (EBUSY); error = VOP_OPENEXTATTR(tvp, ap->a_cred, ap->a_td); if (error == 0) { if (vn_lock(vp, LK_UPGRADE) != 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (!VN_IS_DOOMED(vp)) { if (tvp == unp->un_uppervp) unp->un_flag |= UNIONFS_OPENEXTU; else unp->un_flag |= UNIONFS_OPENEXTL; } vn_lock(vp, LK_DOWNGRADE | LK_RETRY); } return (error); } static int unionfs_closeextattr(struct vop_closeextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; struct vnode *tvp; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); vp = ap->a_vp; unp = VTOUNIONFS(vp); tvp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) tvp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) tvp = unp->un_lowervp; if (tvp == NULLVP) return (EOPNOTSUPP); error = VOP_CLOSEEXTATTR(tvp, ap->a_commit, ap->a_cred, ap->a_td); if (error == 0) { if (vn_lock(vp, LK_UPGRADE) != 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (!VN_IS_DOOMED(vp)) { if (tvp == unp->un_uppervp) unp->un_flag &= ~UNIONFS_OPENEXTU; else unp->un_flag &= ~UNIONFS_OPENEXTL; } vn_lock(vp, LK_DOWNGRADE | LK_RETRY); } return (error); } static int unionfs_getextattr(struct vop_getextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) vp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) vp = unp->un_lowervp; if (vp == NULLVP) return (EOPNOTSUPP); return (VOP_GETEXTATTR(vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td)); } static int unionfs_setextattr(struct vop_setextattr_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct vnode *ovp; struct ucred *cred; struct thread *td; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; ovp = NULLVP; cred = ap->a_cred; td = ap->a_td; UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: enter (un_flag=%x)\n", unp->un_flag); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (unp->un_flag & UNIONFS_OPENEXTU) ovp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) ovp = unp->un_lowervp; if (ovp == NULLVP) return (EOPNOTSUPP); if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && (error = unionfs_copyfile(unp, 1, cred, td)) != 0) { unionfs_setextattr_reopen: if ((unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC panic("unionfs: VOP_OPENEXTATTR failed"); #endif unp->un_flag &= ~UNIONFS_OPENEXTL; } goto unionfs_setextattr_abort; } uvp = unp->un_uppervp; if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0) goto unionfs_setextattr_reopen; unp->un_flag &= ~UNIONFS_OPENEXTL; unp->un_flag |= UNIONFS_OPENEXTU; ovp = uvp; } if (ovp == uvp) error = VOP_SETEXTATTR(ovp, ap->a_attrnamespace, ap->a_name, ap->a_uio, cred, td); unionfs_setextattr_abort: UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: leave (%d)\n", error); return (error); } static int unionfs_listextattr(struct vop_listextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) vp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) vp = unp->un_lowervp; if (vp == NULLVP) return (EOPNOTSUPP); return (VOP_LISTEXTATTR(vp, ap->a_attrnamespace, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td)); } static int unionfs_deleteextattr(struct vop_deleteextattr_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct vnode *ovp; struct ucred *cred; struct thread *td; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; ovp = NULLVP; cred = ap->a_cred; td = ap->a_td; UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: enter (un_flag=%x)\n", unp->un_flag); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (unp->un_flag & UNIONFS_OPENEXTU) ovp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) ovp = unp->un_lowervp; if (ovp == NULLVP) return (EOPNOTSUPP); if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && (error = unionfs_copyfile(unp, 1, cred, td)) != 0) { unionfs_deleteextattr_reopen: if ((unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC panic("unionfs: VOP_OPENEXTATTR failed"); #endif unp->un_flag &= ~UNIONFS_OPENEXTL; } goto unionfs_deleteextattr_abort; } uvp = unp->un_uppervp; if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0) goto unionfs_deleteextattr_reopen; unp->un_flag &= ~UNIONFS_OPENEXTL; unp->un_flag |= UNIONFS_OPENEXTU; ovp = uvp; } if (ovp == uvp) error = VOP_DELETEEXTATTR(ovp, ap->a_attrnamespace, ap->a_name, ap->a_cred, ap->a_td); unionfs_deleteextattr_abort: UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: leave (%d)\n", error); return (error); } static int unionfs_setlabel(struct vop_setlabel_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; int error; UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETLABEL(uvp, ap->a_label, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: leave (%d)\n", error); return (error); } static int unionfs_vptofh(struct vop_vptofh_args *ap) { return (EOPNOTSUPP); } static int unionfs_add_writecount(struct vop_add_writecount_args *ap) { struct vnode *tvp, *vp; struct unionfs_node *unp; int error, writerefs __diagused; vp = ap->a_vp; unp = VTOUNIONFS(vp); tvp = unp->un_uppervp; KASSERT(tvp != NULL, ("%s: adding write ref without upper vnode", __func__)); error = VOP_ADD_WRITECOUNT(tvp, ap->a_inc); if (error != 0) return (error); /* * We need to track the write refs we've passed to the underlying * vnodes so that we can undo them in case we are forcibly unmounted. */ writerefs = atomic_fetchadd_int(&vp->v_writecount, ap->a_inc); /* text refs are bypassed to lowervp */ VNASSERT(writerefs >= 0, vp, ("%s: invalid write count %d", __func__, writerefs)); VNASSERT(writerefs + ap->a_inc >= 0, vp, ("%s: invalid write count inc %d + %d", __func__, writerefs, ap->a_inc)); return (0); } static int unionfs_vput_pair(struct vop_vput_pair_args *ap) { struct mount *mp; struct vnode *dvp, *vp, **vpp, *lvp, *ldvp, *uvp, *udvp, *tempvp; struct unionfs_node *dunp, *unp; int error, res; dvp = ap->a_dvp; vpp = ap->a_vpp; vp = NULLVP; lvp = NULLVP; uvp = NULLVP; unp = NULL; dunp = VTOUNIONFS(dvp); udvp = dunp->un_uppervp; ldvp = dunp->un_lowervp; /* * Underlying vnodes should be locked because the encompassing unionfs * node is locked, but will not be referenced, as the reference will * only be on the unionfs node. Reference them now so that the vput()s * performed by VOP_VPUT_PAIR() will have a reference to drop. */ if (udvp != NULLVP) vref(udvp); if (ldvp != NULLVP) vref(ldvp); if (vpp != NULL) vp = *vpp; if (vp != NULLVP) { unp = VTOUNIONFS(vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; if (uvp != NULLVP) vref(uvp); if (lvp != NULLVP) vref(lvp); /* * If we're being asked to return a locked child vnode, then * we may need to create a replacement vnode in case the * original is reclaimed while the lock is dropped. In that * case we'll need to ensure the mount and the underlying * vnodes aren't also recycled during that window. */ if (!ap->a_unlock_vp) { vhold(vp); if (uvp != NULLVP) vhold(uvp); if (lvp != NULLVP) vhold(lvp); mp = vp->v_mount; vfs_ref(mp); } } /* * TODO: Because unionfs_lock() locks both the lower and upper vnodes * (if available), we must also call VOP_VPUT_PAIR() on both the lower * and upper parent/child pairs. If unionfs_lock() is reworked to lock * only a single vnode, this code will need to change to also only * operate on one vnode pair. */ ASSERT_VOP_LOCKED(ldvp, __func__); ASSERT_VOP_LOCKED(udvp, __func__); ASSERT_VOP_LOCKED(lvp, __func__); ASSERT_VOP_LOCKED(uvp, __func__); KASSERT(lvp == NULLVP || ldvp != NULLVP, ("%s: NULL ldvp with non-NULL lvp", __func__)); if (ldvp != NULLVP) res = VOP_VPUT_PAIR(ldvp, lvp != NULLVP ? &lvp : NULL, true); KASSERT(uvp == NULLVP || udvp != NULLVP, ("%s: NULL udvp with non-NULL uvp", __func__)); if (udvp != NULLVP) res = VOP_VPUT_PAIR(udvp, uvp != NULLVP ? &uvp : NULL, true); ASSERT_VOP_UNLOCKED(ldvp, __func__); ASSERT_VOP_UNLOCKED(udvp, __func__); ASSERT_VOP_UNLOCKED(lvp, __func__); ASSERT_VOP_UNLOCKED(uvp, __func__); /* * VOP_VPUT_PAIR() dropped the references we added to the underlying * vnodes, now drop the caller's reference to the unionfs vnodes. */ if (vp != NULLVP && ap->a_unlock_vp) vrele(vp); vrele(dvp); if (vp == NULLVP || ap->a_unlock_vp) return (res); /* * We're being asked to return a locked vnode. At this point, the * underlying vnodes have been unlocked, so vp may have been reclaimed. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_data == NULL && vfs_busy(mp, MBF_NOWAIT) == 0) { vput(vp); error = unionfs_nodeget(mp, uvp, lvp, dvp, &tempvp, NULL); if (error == 0) { vn_lock(tempvp, LK_EXCLUSIVE | LK_RETRY); *vpp = tempvp; } else vget(vp, LK_EXCLUSIVE | LK_RETRY); vfs_unbusy(mp); } if (lvp != NULLVP) vdrop(lvp); if (uvp != NULLVP) vdrop(uvp); vdrop(vp); vfs_rel(mp); return (res); } static int unionfs_set_text(struct vop_set_text_args *ap) { struct vnode *tvp; struct unionfs_node *unp; int error; /* * We assume text refs are managed against lvp/uvp through the * executable mapping backed by its VM object. We therefore don't * need to track leased text refs in the case of a forcible unmount. */ unp = VTOUNIONFS(ap->a_vp); ASSERT_VOP_LOCKED(ap->a_vp, __func__); tvp = unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp; error = VOP_SET_TEXT(tvp); return (error); } static int unionfs_unset_text(struct vop_unset_text_args *ap) { struct vnode *tvp; struct unionfs_node *unp; ASSERT_VOP_LOCKED(ap->a_vp, __func__); unp = VTOUNIONFS(ap->a_vp); tvp = unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp; VOP_UNSET_TEXT_CHECKED(tvp); return (0); } struct vop_vector unionfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = unionfs_access, .vop_aclcheck = unionfs_aclcheck, .vop_advlock = unionfs_advlock, .vop_bmap = VOP_EOPNOTSUPP, .vop_cachedlookup = unionfs_lookup, .vop_close = unionfs_close, .vop_closeextattr = unionfs_closeextattr, .vop_create = unionfs_create, .vop_deleteextattr = unionfs_deleteextattr, .vop_fsync = unionfs_fsync, .vop_getacl = unionfs_getacl, .vop_getattr = unionfs_getattr, .vop_getextattr = unionfs_getextattr, .vop_getwritemount = unionfs_getwritemount, .vop_inactive = unionfs_inactive, .vop_need_inactive = vop_stdneed_inactive, .vop_islocked = unionfs_islocked, .vop_ioctl = unionfs_ioctl, .vop_link = unionfs_link, .vop_listextattr = unionfs_listextattr, .vop_lock1 = unionfs_lock, .vop_lookup = vfs_cache_lookup, .vop_mkdir = unionfs_mkdir, .vop_mknod = unionfs_mknod, .vop_open = unionfs_open, .vop_openextattr = unionfs_openextattr, .vop_pathconf = unionfs_pathconf, .vop_poll = unionfs_poll, .vop_print = unionfs_print, .vop_read = unionfs_read, .vop_readdir = unionfs_readdir, .vop_readlink = unionfs_readlink, .vop_reclaim = unionfs_reclaim, .vop_remove = unionfs_remove, .vop_rename = unionfs_rename, .vop_rmdir = unionfs_rmdir, .vop_setacl = unionfs_setacl, .vop_setattr = unionfs_setattr, .vop_setextattr = unionfs_setextattr, .vop_setlabel = unionfs_setlabel, .vop_strategy = unionfs_strategy, .vop_symlink = unionfs_symlink, .vop_unlock = unionfs_unlock, .vop_whiteout = unionfs_whiteout, .vop_write = unionfs_write, .vop_vptofh = unionfs_vptofh, .vop_add_writecount = unionfs_add_writecount, .vop_vput_pair = unionfs_vput_pair, .vop_set_text = unionfs_set_text, .vop_unset_text = unionfs_unset_text, }; VFS_VOP_VECTOR_REGISTER(unionfs_vnodeops); diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 364c10987ea6..cb45d18fbb85 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1,2086 +1,2086 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", "Location of process' ps_strings structure"); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", "Top of process stack"); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", "Stack memory permissions"); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, "Process' command line characters cache limit"); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); static int core_dump_can_intr = 1; SYSCTL_INT(_kern, OID_AUTO, core_dump_can_intr, CTLFLAG_RWTUN, &core_dump_can_intr, 0, "Core dumping interruptible with SIGKILL"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; vm_offset_t ps_strings; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)PROC_PS_STRINGS(p); return (SYSCTL_OUT(req, &val, sizeof(val))); } #endif ps_strings = PROC_PS_STRINGS(p); return (SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings))); } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; vm_offset_t val; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val32; val32 = round_page((unsigned int)p->p_vmspace->vm_stacktop); return (SYSCTL_OUT(req, &val32, sizeof(val32))); } #endif val = round_page(p->p_vmspace->vm_stacktop); return (SYSCTL_OUT(req, &val, sizeof(val))); } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL, oldvmspace); post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; }; #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL, oldvmspace); } post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p, oldvmspace); post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == EJUSTRETURN) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } exec_cleanup(td, oldvmspace); } /* * kern_execve() has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace) { TSEXEC(td->td_proc->p_pid, args->begin_argv); AUDIT_ARG_ARGV(args->begin_argv, args->argc, exec_args_get_begin_envv(args) - args->begin_argv); AUDIT_ARG_ENVV(exec_args_get_begin_envv(args), args->envc, args->endp - exec_args_get_begin_envv(args)); /* Must have at least one argument. */ if (args->argc == 0) { exec_free_args(args); return (EINVAL); } return (do_execve(td, args, mac_p, oldvmspace)); } static void execve_nosetid(struct image_params *imgp) { imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; uintptr_t stack_base; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct ktr_io_params *kiop; #endif struct vnode *oldtextvp, *newtextvp; struct vnode *oldtextdvp, *newtextdvp; char *oldbinname, *newbinname; bool credential_changing; #ifdef MAC struct label *interpvplabel = NULL; bool will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif int error, i, orig_osrel; uint32_t orig_fctl0; Elf_Brandinfo *orig_brandinfo; size_t freepath_size; static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; oldtextvp = oldtextdvp = NULL; newtextvp = newtextdvp = NULL; newbinname = oldbinname = NULL; #ifdef KTRACE kiop = NULL; #endif /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; orig_osrel = p->p_osrel; orig_fctl0 = p->p_fctl0; orig_brandinfo = p->p_elf_brandinfo; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif /* * Translate the file name. namei() returns a vnode * pointer in ni_vp among other things. */ NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW | - SAVENAME | AUDITVNODE1 | WANTPARENT, UIO_SYSSPACE, + AUDITVNODE1 | WANTPARENT, UIO_SYSSPACE, args->fname); error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; newtextdvp = nd.ni_dvp; nd.ni_dvp = NULL; newbinname = malloc(nd.ni_cnd.cn_namelen + 1, M_PARGS, M_WAITOK); memcpy(newbinname, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen); newbinname[nd.ni_cnd.cn_namelen] = '\0'; imgp->vp = newtextvp; /* * Do the best to calculate the full path to the image file. */ if (args->fname[0] == '/') { imgp->execpath = args->fname; } else { VOP_UNLOCK(imgp->vp); freepath_size = MAXPATHLEN; if (vn_fullpath_hardlink(newtextvp, newtextdvp, newbinname, nd.ni_cnd.cn_namelen, &imgp->execpath, &imgp->freepath, &freepath_size) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } } else { AUDIT_ARG_FD(args->fd); /* * If the descriptors was not opened with O_PATH, then * we require that it was opened with O_EXEC or * O_RDONLY. In either case, exec_check_permissions() * below checks _current_ file access mode regardless * of the permissions additionally checked at the * open(2). */ error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp); if (error != 0) goto exec_fail; if (vn_fullpath(newtextvp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(newtextvp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions. Also 'opens' file and sets its vnode to * text mode. */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; imgp->proc->p_fctl0 = 0; imgp->proc->p_elf_brandinfo = NULL; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = false; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp) != 0; credential_changing |= will_transition; #endif /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */ if (credential_changing) imgp->proc->p_pdeathsig = 0; if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) error = ENOEXEC; goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * The text reference needs to be removed for scripts. * There is a short period before we determine that * something is a script where text reference is active. * The vnode lock is held over this entire period * so nothing should illegitimately be blocked. */ MPASS(imgp->textset); VOP_UNSET_TEXT_CHECKED(newtextvp); imgp->textset = false; /* free name buffer and old vnode */ #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = false; } vput(newtextvp); imgp->vp = newtextvp = NULL; if (args->fname != NULL) { if (newtextdvp != NULL) { vrele(newtextdvp); newtextdvp = NULL; } NDFREE_PNBUF(&nd); free(newbinname, M_PARGS); newbinname = NULL; } vm_object_deallocate(imgp->object); imgp->object = NULL; execve_nosetid(imgp); imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * Copy out strings (args and env) and initialize stack base. */ error = (*p->p_sysent->sv_copyout_strings)(imgp, &stack_base); if (error != 0) { vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * Stack setup. */ error = (*p->p_sysent->sv_fixup)(&stack_base, imgp); if (error != 0) { vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * For security and other reasons, the file descriptor table cannot be * shared after an exec. */ fdunshare(td); pdunshare(td); /* close files on exec */ fdcloseexec(td); /* * Malloc things before we need locks. */ i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0) p->p_flag2 &= ~P2_STKGAP_DISABLE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } if ((imgp->sysent->sv_setid_allowed != NULL && !(*imgp->sysent->sv_setid_allowed)(td, imgp)) || (p->p_flag2 & P2_NO_NEW_PRIVS) != 0) execve_nosetid(imgp); /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE kiop = ktrprocexec(p); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in kern.proc.pathname. This vnode was * referenced by namei() or by fexecve variant of fname handling. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; oldtextdvp = p->p_textdvp; p->p_textdvp = newtextdvp; newtextdvp = NULL; oldbinname = p->p_binname; p->p_binname = newbinname; newbinname = NULL; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ (*p->p_sysent->sv_setregs)(td, imgp, stack_base); VOP_MMAPPED(imgp->vp); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (error != 0) { p->p_osrel = orig_osrel; p->p_fctl0 = orig_fctl0; p->p_elf_brandinfo = orig_brandinfo; } if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (imgp->textset) VOP_UNSET_TEXT_CHECKED(imgp->vp); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp); if (args->fname != NULL) NDFREE_PNBUF(&nd); if (newtextdvp != NULL) vrele(newtextdvp); free(newbinname, M_PARGS); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); if (oldtextdvp != NULL) vrele(oldtextdvp); free(oldbinname, M_PARGS); #ifdef KTRACE ktr_io_params_free(kiop); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exec_cleanup(td, oldvmspace); exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif /* * We don't want cpu_set_syscall_retval() to overwrite any of * the register values put in place by exec_setregs(). * Implementations of cpu_set_syscall_retval() will leave * registers unmodified when returning EJUSTRETURN. */ return (error == 0 ? EJUSTRETURN : error); } void exec_cleanup(struct thread *td, struct vmspace *oldvmspace) { if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(td->td_proc->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } int exec_map_first_page(struct image_params *imgp) { vm_object_t object; vm_page_t m; int error; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { VM_OBJECT_WLOCK(object); vm_object_color(object, 0); VM_OBJECT_WUNLOCK(object); } #endif error = vm_page_grab_valid_unlocked(&m, object, 0, VM_ALLOC_COUNT(VM_INITIAL_PAGEIN) | VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); if (error != VM_PAGER_OK) return (EIO); imgp->firstpage = sf_buf_alloc(m, 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(struct image_params *imgp) { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_unwire(m, PQ_ACTIVE); } } void exec_onexec_old(struct thread *td) { sigfastblock_clear(td); umtx_exec(td->td_proc); } /* * This is an optimization which removes the unmanaged shared page * mapping. In combination with pmap_remove_pages(), which cleans all * managed mappings in the process' vmspace pmap, no work will be left * for pmap_remove(min, max). */ void exec_free_abi_mappings(struct proc *p) { struct vmspace *vmspace; vmspace = p->p_vmspace; if (refcount_load(&vmspace->vm_refcnt) != 1) return; if (!PROC_HAS_SHP(p)) return; pmap_remove(vmspace_pmap(vmspace), vmspace->vm_shp_base, vmspace->vm_shp_base + p->p_sysent->sv_shared_page_len); } /* * Run down the current address space and install a new one. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; struct thread *td = curthread; vm_offset_t sv_minuser; vm_map_t map; imgp->vmspace_destroyed = true; imgp->sysent = sv; if (p->p_sysent->sv_onexec_old != NULL) p->p_sysent->sv_onexec_old(td); itimers_exec(p); EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (refcount_load(&vmspace->vm_refcnt) == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser && cpu_exec_vmspace_reuse(p, map)) { exec_free_abi_mappings(p); shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* * An exec terminates mlockall(MCL_FUTURE). * ASLR and W^X states must be re-evaluated. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR | MAP_ASLR_IGNSTART | MAP_ASLR_STACK | MAP_WXORX); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } map->flags |= imgp->map_flags; return (sv->sv_onexec != NULL ? sv->sv_onexec(p, imgp) : 0); } /* * Compute the stack size limit and map the main process stack. * Map the shared page. */ int exec_map_stack(struct image_params *imgp) { struct rlimit rlim_stack; struct sysentvec *sv; struct proc *p; vm_map_t map; struct vmspace *vmspace; vm_offset_t stack_addr, stack_top; vm_offset_t sharedpage_addr; u_long ssiz; int error, find_space, stack_off; vm_prot_t stack_prot; vm_object_t obj; p = imgp->proc; sv = p->p_sysent; if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } vmspace = p->p_vmspace; map = &vmspace->vm_map; stack_prot = sv->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot; if ((map->flags & MAP_ASLR_STACK) != 0) { stack_addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(curthread, RLIMIT_DATA)); find_space = VMFS_ANY_SPACE; } else { stack_addr = sv->sv_usrstack - ssiz; find_space = VMFS_NO_SPACE; } error = vm_map_find(map, NULL, 0, &stack_addr, (vm_size_t)ssiz, sv->sv_usrstack, find_space, stack_prot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error != KERN_SUCCESS) { uprintf("exec_new_vmspace: mapping stack size %#jx prot %#x " "failed, mach error %d errno %d\n", (uintmax_t)ssiz, stack_prot, error, vm_mmap_to_errno(error)); return (vm_mmap_to_errno(error)); } stack_top = stack_addr + ssiz; if ((map->flags & MAP_ASLR_STACK) != 0) { /* Randomize within the first page of the stack. */ arc4rand(&stack_off, sizeof(stack_off), 0); stack_top -= rounddown2(stack_off & PAGE_MASK, sizeof(void *)); } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj == NULL) { sharedpage_addr = 0; goto out; } /* * If randomization is disabled then the shared page will * be mapped at address specified in sysentvec. * Otherwise any address above .data section can be selected. * Same logic is used for stack address randomization. * If the address randomization is applied map a guard page * at the top of UVA. */ vm_object_reference(obj); if ((imgp->imgp_flags & IMGP_ASLR_SHARED_PAGE) != 0) { sharedpage_addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(curthread, RLIMIT_DATA)); error = vm_map_fixed(map, NULL, 0, sv->sv_maxuser - PAGE_SIZE, PAGE_SIZE, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD); if (error != KERN_SUCCESS) { /* * This is not fatal, so let's just print a warning * and continue. */ uprintf("%s: Mapping guard page at the top of UVA failed" " mach error %d errno %d", __func__, error, vm_mmap_to_errno(error)); } error = vm_map_find(map, obj, 0, &sharedpage_addr, sv->sv_shared_page_len, sv->sv_maxuser, VMFS_ANY_SPACE, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); } else { sharedpage_addr = sv->sv_shared_page_base; vm_map_fixed(map, obj, 0, sharedpage_addr, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); } if (error != KERN_SUCCESS) { uprintf("%s: mapping shared page at addr: %p" "failed, mach error %d errno %d\n", __func__, (void *)sharedpage_addr, error, vm_mmap_to_errno(error)); vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } out: /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_maxsaddr = (char *)stack_addr; vmspace->vm_stacktop = stack_top; vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_shp_base = sharedpage_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, const char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long arg, env; int error; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ error = exec_args_add_fname(args, fname, segflg); if (error != 0) goto err_exit; /* * extract arguments first */ for (;;) { error = fueword(argv++, &arg); if (error == -1) { error = EFAULT; goto err_exit; } if (arg == 0) break; error = exec_args_add_arg(args, (char *)(uintptr_t)arg, UIO_USERSPACE); if (error != 0) goto err_exit; } /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &env); if (error == -1) { error = EFAULT; goto err_exit; } if (env == 0) break; error = exec_args_add_env(args, (char *)(uintptr_t)env, UIO_USERSPACE); if (error != 0) goto err_exit; } } return (0); err_exit: exec_free_args(args); return (error); } struct exec_args_kva { vm_offset_t addr; u_int gen; SLIST_ENTRY(exec_args_kva) next; }; DPCPU_DEFINE_STATIC(struct exec_args_kva *, exec_args_kva); static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; static struct mtx exec_args_kva_mtx; static u_int exec_args_gen; static void exec_prealloc_args_kva(void *arg __unused) { struct exec_args_kva *argkva; u_int i; SLIST_INIT(&exec_args_kva_freelist); mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); for (i = 0; i < exec_map_entries; i++) { argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); argkva->gen = exec_args_gen; SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); } } SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); static vm_offset_t exec_alloc_args_kva(void **cookie) { struct exec_args_kva *argkva; argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_PTR(exec_args_kva)); if (argkva == NULL) { mtx_lock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) (void)mtx_sleep(&exec_args_kva_freelist, &exec_args_kva_mtx, 0, "execkva", 0); SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); mtx_unlock(&exec_args_kva_mtx); } kasan_mark((void *)argkva->addr, exec_map_entry_size, exec_map_entry_size, 0); *(struct exec_args_kva **)cookie = argkva; return (argkva->addr); } static void exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) { vm_offset_t base; base = argkva->addr; kasan_mark((void *)argkva->addr, 0, exec_map_entry_size, KASAN_EXEC_ARGS_FREED); if (argkva->gen != gen) { (void)vm_map_madvise(exec_map, base, base + exec_map_entry_size, MADV_FREE); argkva->gen = gen; } if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), (uintptr_t)NULL, (uintptr_t)argkva)) { mtx_lock(&exec_args_kva_mtx); SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); wakeup_one(&exec_args_kva_freelist); mtx_unlock(&exec_args_kva_mtx); } } static void exec_free_args_kva(void *cookie) { exec_release_args_kva(cookie, exec_args_gen); } static void exec_args_kva_lowmem(void *arg __unused) { SLIST_HEAD(, exec_args_kva) head; struct exec_args_kva *argkva; u_int gen; int i; gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; /* * Force an madvise of each KVA range. Any currently allocated ranges * will have MADV_FREE applied once they are freed. */ SLIST_INIT(&head); mtx_lock(&exec_args_kva_mtx); SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); mtx_unlock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&head)) != NULL) { SLIST_REMOVE_HEAD(&head, next); exec_release_args_kva(argkva, gen); } CPU_FOREACH(i) { argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); if (argkva != NULL) exec_release_args_kva(argkva, gen); } } EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, EVENTHANDLER_PRI_ANY); /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)exec_alloc_args_kva(&args->bufkva); return (0); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { exec_free_args_kva(args->bufkva); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } } /* * A set to functions to fill struct image args. * * NOTE: exec_args_add_fname() must be called (possibly with a NULL * fname) before the other functions. All exec_args_add_arg() calls must * be made before any exec_args_add_env() calls. exec_args_adjust_args() * may be called any time after exec_args_add_fname(). * * exec_args_add_fname() - install path to be executed * exec_args_add_arg() - append an argument string * exec_args_add_env() - append an env string * exec_args_adjust_args() - adjust location of the argument list to * allow new arguments to be prepended */ int exec_args_add_fname(struct image_args *args, const char *fname, enum uio_seg segflg) { int error; size_t length; KASSERT(args->fname == NULL, ("fname already appended")); KASSERT(args->endp == NULL, ("already appending to args")); if (fname != NULL) { args->fname = args->buf; error = segflg == UIO_SYSSPACE ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) return (error == ENAMETOOLONG ? E2BIG : error); } else length = 0; /* Set up for _arg_*()/_env_*() */ args->endp = args->buf + length; /* begin_argv must be set and kept updated */ args->begin_argv = args->endp; KASSERT(exec_map_entry_size - length >= ARG_MAX, ("too little space remaining for arguments %zu < %zu", exec_map_entry_size - length, (size_t)ARG_MAX)); args->stringspace = ARG_MAX; return (0); } static int exec_args_add_str(struct image_args *args, const char *str, enum uio_seg segflg, int *countp) { int error; size_t length; KASSERT(args->endp != NULL, ("endp not initialized")); KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); error = (segflg == UIO_SYSSPACE) ? copystr(str, args->endp, args->stringspace, &length) : copyinstr(str, args->endp, args->stringspace, &length); if (error != 0) return (error == ENAMETOOLONG ? E2BIG : error); args->stringspace -= length; args->endp += length; (*countp)++; return (0); } int exec_args_add_arg(struct image_args *args, const char *argp, enum uio_seg segflg) { KASSERT(args->envc == 0, ("appending args after env")); return (exec_args_add_str(args, argp, segflg, &args->argc)); } int exec_args_add_env(struct image_args *args, const char *envp, enum uio_seg segflg) { if (args->envc == 0) args->begin_envv = args->endp; return (exec_args_add_str(args, envp, segflg, &args->envc)); } int exec_args_adjust_args(struct image_args *args, size_t consume, ssize_t extend) { ssize_t offset; KASSERT(args->endp != NULL, ("endp not initialized")); KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); offset = extend - consume; if (args->stringspace < offset) return (E2BIG); memmove(args->begin_argv + extend, args->begin_argv + consume, args->endp - args->begin_argv + consume); if (args->envc > 0) args->begin_envv += offset; args->endp += offset; args->stringspace -= offset; return (0); } char * exec_args_get_begin_envv(struct image_args *args) { KASSERT(args->endp != NULL, ("endp not initialized")); if (args->envc > 0) return (args->begin_envv); return (args->endp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ int exec_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; struct proc *p; struct sysentvec *sysent; size_t execpath_len; int error, szsigcode; char canary[sizeof(long) * 8]; p = imgp->proc; sysent = p->p_sysent; destp = PROC_PS_STRINGS(p); arginfo = imgp->ps_strings = (void *)destp; /* * Install sigcode. */ if (sysent->sv_shared_page_base == 0 && sysent->sv_szsigcode != NULL) { szsigcode = *(sysent->sv_szsigcode); destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); error = copyout(sysent->sv_sigcode, (void *)destp, szsigcode); if (error != 0) return (error); } /* * Copy the image path for the rtld. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) { execpath_len = strlen(imgp->execpath) + 1; destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ imgp->pagesizeslen = sizeof(pagesizes[0]) * MAXPAGESIZES; destp -= imgp->pagesizeslen; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = (void *)destp; error = copyout(pagesizes, imgp->pagesizes, imgp->pagesizeslen); if (error != 0) return (error); /* * Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has up to AT_COUNT entries. */ destp -= AT_COUNT * sizeof(Elf_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* * vectp also becomes our initial stack base */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* * Fill in "ps_strings" struct for ps, w, etc. */ imgp->argv = vectp; if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* a null vector table pointer separates the argp's from the envp's */ if (suword(vectp++, 0) != 0) return (EFAULT); imgp->envv = vectp; if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* end of vector table is a null pointer */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(struct image_params *imgp) { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. * * Add a text reference now so no one can write to the * executable while we're activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ error = VOP_SET_TEXT(vp); if (error != 0) return (error); imgp->textset = true; /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = true; return (error); } /* * Exec handler registration */ int exec_register(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; u_int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } /* * Write out a core segment to the compression stream. */ static int compress_chunk(struct coredump_params *cp, char *base, char *buf, size_t len) { size_t chunk_len; int error; while (len > 0) { chunk_len = MIN(len, CORE_BUF_SIZE); /* * We can get EFAULT error here. * In that case zero out the current chunk of the segment. */ error = copyin(base, buf, chunk_len); if (error != 0) bzero(buf, chunk_len); error = compressor_write(cp->comp, buf, chunk_len); if (error != 0) break; base += chunk_len; len -= chunk_len; } return (error); } int core_write(struct coredump_params *cp, const void *base, size_t len, off_t offset, enum uio_seg seg, size_t *resid) { return (vn_rdwr_inchunks(UIO_WRITE, cp->vp, __DECONST(void *, base), len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED, cp->active_cred, cp->file_cred, resid, cp->td)); } int core_output(char *base, size_t len, off_t offset, struct coredump_params *cp, void *tmpbuf) { vm_map_t map; struct mount *mp; size_t resid, runlen; int error; bool success; KASSERT((uintptr_t)base % PAGE_SIZE == 0, ("%s: user address %p is not page-aligned", __func__, base)); if (cp->comp != NULL) return (compress_chunk(cp, base, tmpbuf, len)); map = &cp->td->td_proc->p_vmspace->vm_map; for (; len > 0; base += runlen, offset += runlen, len -= runlen) { /* * Attempt to page in all virtual pages in the range. If a * virtual page is not backed by the pager, it is represented as * a hole in the file. This can occur with zero-filled * anonymous memory or truncated files, for example. */ for (runlen = 0; runlen < len; runlen += PAGE_SIZE) { if (core_dump_can_intr && curproc_sigkilled()) return (EINTR); error = vm_fault(map, (uintptr_t)base + runlen, VM_PROT_READ, VM_FAULT_NOFILL, NULL); if (runlen == 0) success = error == KERN_SUCCESS; else if ((error == KERN_SUCCESS) != success) break; } if (success) { error = core_write(cp, base, runlen, offset, UIO_USERSPACE, &resid); if (error != 0) { if (error != EFAULT) break; /* * EFAULT may be returned if the user mapping * could not be accessed, e.g., because a mapped * file has been truncated. Skip the page if no * progress was made, to protect against a * hypothetical scenario where vm_fault() was * successful but core_write() returns EFAULT * anyway. */ runlen -= resid; if (runlen == 0) { success = false; runlen = PAGE_SIZE; } } } if (!success) { error = vn_start_write(cp->vp, &mp, V_WAIT); if (error != 0) break; vn_lock(cp->vp, LK_EXCLUSIVE | LK_RETRY); error = vn_truncate_locked(cp->vp, offset + runlen, false, cp->td->td_ucred); VOP_UNLOCK(cp->vp); vn_finished_write(mp); if (error != 0) break; } } return (error); } /* * Drain into a core file. */ int sbuf_drain_core_output(void *arg, const char *data, int len) { struct coredump_params *cp; struct proc *p; int error, locked; cp = arg; p = cp->td->td_proc; /* * Some kern_proc out routines that print to this sbuf may * call us with the process lock held. Draining with the * non-sleepable lock held is unsafe. The lock is needed for * those routines when dumping a live process. In our case we * can safely release the lock before draining and acquire * again after. */ locked = PROC_LOCKED(p); if (locked) PROC_UNLOCK(p); if (cp->comp != NULL) error = compressor_write(cp->comp, __DECONST(char *, data), len); else error = core_write(cp, __DECONST(void *, data), len, cp->offset, UIO_SYSSPACE, NULL); if (locked) PROC_LOCK(p); if (error != 0) return (-error); cp->offset += len; return (len); } diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index 745c1174638e..29c7bcdd4289 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -1,2944 +1,2939 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_add_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked. */ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); - cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); - if ((cnp->cn_flags & HASBUF) == 0) - panic("%s: no name", __func__); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; uint64_t **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; entry.d_off = offset + entry.d_reclen; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } dirent_terminate(&entry); if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. */ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); - if ((cnp->cn_flags & HASBUF) == 0) - panic("%s: no name", __func__); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; struct prison *tpr; struct mqfs_node *pn, *tpn; struct vnode *pr_root; pr_root = pr->pr_root; if (pr->pr_parent->pr_root == pr_root) return (0); TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr != pr && tpr->pr_root == pr_root) return (0); } /* * No jails are rooted in this directory anymore, * so no queues should be either. */ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct pwddesc *pdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); pdp = td->td_proc->p_pd; cmode = (((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); /* * "." and ".." are magic directories, populated on the fly, and cannot * be opened as queues. */ if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. */ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_event_rights, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_read_rights, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_write_rights, fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } static int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_noref(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights(fdp, mqd), &cap_event_rights); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct mqueue *mq; #ifdef INVARIANTS struct filedesc *fdp; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); #endif if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_noref(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } static struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops); static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 2b78c3e51907..1a4d2d5adc0a 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1,3565 +1,3565 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All Rights Reserved. * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved. * Copyright (c) 2018 Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 */ /* * UNIX Domain (Local) Sockets * * This is an implementation of UNIX (local) domain sockets. Each socket has * an associated struct unpcb (UNIX protocol control block). Stream sockets * may be connected to 0 or 1 other socket. Datagram sockets may be * connected to 0, 1, or many other sockets. Sockets may be created and * connected in pairs (socketpair(2)), or bound/connected to using the file * system name space. For most purposes, only the receive socket buffer is * used, as sending on one socket delivers directly to the receive socket * buffer of a second socket. * * The implementation is substantially complicated by the fact that * "ancillary data", such as file descriptors or credentials, may be passed * across UNIX domain sockets. The potential for passing UNIX domain sockets * over other UNIX domain sockets requires the implementation of a simple * garbage collector to find and tear down cycles of disconnected sockets. * * TODO: * RDM * rethink name space problems * need a proper out-of-band */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include MALLOC_DECLARE(M_FILECAPS); static struct domain localdomain; static uma_zone_t unp_zone; static unp_gen_t unp_gencnt; /* (l) */ static u_int unp_count; /* (l) Count of local sockets. */ static ino_t unp_ino; /* Prototype for fake inode numbers. */ static int unp_rights; /* (g) File descriptors in flight. */ static struct unp_head unp_shead; /* (l) List of stream sockets. */ static struct unp_head unp_dhead; /* (l) List of datagram sockets. */ static struct unp_head unp_sphead; /* (l) List of seqpacket sockets. */ struct unp_defer { SLIST_ENTRY(unp_defer) ud_link; struct file *ud_fp; }; static SLIST_HEAD(, unp_defer) unp_defers; static int unp_defers_count; static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; /* * Garbage collection of cyclic file descriptor/socket references occurs * asynchronously in a taskqueue context in order to avoid recursion and * reentrance in the UNIX domain socket, file descriptor, and socket layer * code. See unp_gc() for a full description. */ static struct timeout_task unp_gc_task; /* * The close of unix domain sockets attached as SCM_RIGHTS is * postponed to the taskqueue, to avoid arbitrary recursion depth. * The attached sockets might have another sockets attached. */ static struct task unp_defer_task; /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering for * stream sockets, although the total for sender and receiver is actually * only PIPSIZ. * * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should be * large enough for at least one max-size datagram plus address. */ #ifndef PIPSIZ #define PIPSIZ 8192 #endif static u_long unpst_sendspace = PIPSIZ; static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_maxdgram = 2*1024; static u_long unpdg_recvspace = 16*1024; /* support 8KB syslog msgs */ static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */ static u_long unpsp_recvspace = PIPSIZ; static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Local domain"); static SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SOCK_STREAM"); static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SOCK_DGRAM"); static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SOCK_SEQPACKET"); SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, "Default stream send space."); SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, &unpst_recvspace, 0, "Default stream receive space."); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, &unpdg_maxdgram, 0, "Maximum datagram size."); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, "Default datagram receive space."); SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW, &unpsp_sendspace, 0, "Default seqpacket send space."); SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW, &unpsp_recvspace, 0, "Default seqpacket receive space."); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "File descriptors in flight."); SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD, &unp_defers_count, 0, "File descriptors deferred to taskqueue for close."); /* * Locking and synchronization: * * Several types of locks exist in the local domain socket implementation: * - a global linkage lock * - a global connection list lock * - the mtxpool lock * - per-unpcb mutexes * * The linkage lock protects the global socket lists, the generation number * counter and garbage collector state. * * The connection list lock protects the list of referring sockets in a datagram * socket PCB. This lock is also overloaded to protect a global list of * sockets whose buffers contain socket references in the form of SCM_RIGHTS * messages. To avoid recursion, such references are released by a dedicated * thread. * * The mtxpool lock protects the vnode from being modified while referenced. * Lock ordering rules require that it be acquired before any PCB locks. * * The unpcb lock (unp_mtx) protects the most commonly referenced fields in the * unpcb. This includes the unp_conn field, which either links two connected * PCBs together (for connected socket types) or points at the destination * socket (for connectionless socket types). The operations of creating or * destroying a connection therefore involve locking multiple PCBs. To avoid * lock order reversals, in some cases this involves dropping a PCB lock and * using a reference counter to maintain liveness. * * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer, * allocated in pr_attach() and freed in pr_detach(). The validity of that * pointer is an invariant, so no lock is required to dereference the so_pcb * pointer if a valid socket reference is held by the caller. In practice, * this is always true during operations performed on a socket. Each unpcb * has a back-pointer to its socket, unp_socket, which will be stable under * the same circumstances. * * This pointer may only be safely dereferenced as long as a valid reference * to the unpcb is held. Typically, this reference will be from the socket, * or from another unpcb when the referring unpcb's lock is held (in order * that the reference not be invalidated during use). For example, to follow * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee * that detach is not run clearing unp_socket. * * Blocking with UNIX domain sockets is a tricky issue: unlike most network * protocols, bind() is a non-atomic operation, and connect() requires * potential sleeping in the protocol, due to potentially waiting on local or * distributed file systems. We try to separate "lookup" operations, which * may sleep, and the IPC operations themselves, which typically can occur * with relative atomicity as locks can be held over the entire operation. * * Another tricky issue is simultaneous multi-threaded or multi-process * access to a single UNIX domain socket. These are handled by the flags * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or * binding, both of which involve dropping UNIX domain socket locks in order * to perform namei() and other file system operations. */ static struct rwlock unp_link_rwlock; static struct mtx unp_defers_lock; #define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \ "unp_link_rwlock") #define UNP_LINK_LOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_LOCKED) #define UNP_LINK_UNLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_UNLOCKED) #define UNP_LINK_RLOCK() rw_rlock(&unp_link_rwlock) #define UNP_LINK_RUNLOCK() rw_runlock(&unp_link_rwlock) #define UNP_LINK_WLOCK() rw_wlock(&unp_link_rwlock) #define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock) #define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_WLOCKED) #define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock) #define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \ "unp_defer", NULL, MTX_DEF) #define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock) #define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock) #define UNP_REF_LIST_LOCK() UNP_DEFERRED_LOCK(); #define UNP_REF_LIST_UNLOCK() UNP_DEFERRED_UNLOCK(); #define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \ "unp", "unp", \ MTX_DUPOK|MTX_DEF) #define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx) #define UNP_PCB_LOCKPTR(unp) (&(unp)->unp_mtx) #define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx) #define UNP_PCB_TRYLOCK(unp) mtx_trylock(&(unp)->unp_mtx) #define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx) #define UNP_PCB_OWNED(unp) mtx_owned(&(unp)->unp_mtx) #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED) #define UNP_PCB_UNLOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED) static int uipc_connect2(struct socket *, struct socket *); static int uipc_ctloutput(struct socket *, struct sockopt *); static int unp_connect(struct socket *, struct sockaddr *, struct thread *); static int unp_connectat(int, struct socket *, struct sockaddr *, struct thread *, bool); typedef enum { PRU_CONNECT, PRU_CONNECT2 } conn2_how; static void unp_connect2(struct socket *so, struct socket *so2, conn2_how); static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2); static void unp_dispose(struct socket *so); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *); static void unp_gc(__unused void *, int); static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int)); static void unp_discard(struct file *); static void unp_freerights(struct filedescent **, int); static int unp_internalize(struct mbuf **, struct thread *, struct mbuf **, u_int *, u_int *); static void unp_internalize_fp(struct file *); static int unp_externalize(struct mbuf *, struct mbuf **, int); static int unp_externalize_fp(struct file *); static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *, int, struct mbuf **, u_int *, u_int *); static void unp_process_defers(void * __unused, int); static void unp_pcb_hold(struct unpcb *unp) { u_int old __unused; old = refcount_acquire(&unp->unp_refcount); KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp)); } static __result_use_check bool unp_pcb_rele(struct unpcb *unp) { bool ret; UNP_PCB_LOCK_ASSERT(unp); if ((ret = refcount_release(&unp->unp_refcount))) { UNP_PCB_UNLOCK(unp); UNP_PCB_LOCK_DESTROY(unp); uma_zfree(unp_zone, unp); } return (ret); } static void unp_pcb_rele_notlast(struct unpcb *unp) { bool ret __unused; ret = refcount_release(&unp->unp_refcount); KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp)); } static void unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2) { UNP_PCB_UNLOCK_ASSERT(unp); UNP_PCB_UNLOCK_ASSERT(unp2); if (unp == unp2) { UNP_PCB_LOCK(unp); } else if ((uintptr_t)unp2 > (uintptr_t)unp) { UNP_PCB_LOCK(unp); UNP_PCB_LOCK(unp2); } else { UNP_PCB_LOCK(unp2); UNP_PCB_LOCK(unp); } } static void unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2) { UNP_PCB_UNLOCK(unp); if (unp != unp2) UNP_PCB_UNLOCK(unp2); } /* * Try to lock the connected peer of an already locked socket. In some cases * this requires that we unlock the current socket. The pairbusy counter is * used to block concurrent connection attempts while the lock is dropped. The * caller must be careful to revalidate PCB state. */ static struct unpcb * unp_pcb_lock_peer(struct unpcb *unp) { struct unpcb *unp2; UNP_PCB_LOCK_ASSERT(unp); unp2 = unp->unp_conn; if (unp2 == NULL) return (NULL); if (__predict_false(unp == unp2)) return (unp); UNP_PCB_UNLOCK_ASSERT(unp2); if (__predict_true(UNP_PCB_TRYLOCK(unp2))) return (unp2); if ((uintptr_t)unp2 > (uintptr_t)unp) { UNP_PCB_LOCK(unp2); return (unp2); } unp->unp_pairbusy++; unp_pcb_hold(unp2); UNP_PCB_UNLOCK(unp); UNP_PCB_LOCK(unp2); UNP_PCB_LOCK(unp); KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL, ("%s: socket %p was reconnected", __func__, unp)); if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) { unp->unp_flags &= ~UNP_WAITING; wakeup(unp); } if (unp_pcb_rele(unp2)) { /* unp2 is unlocked. */ return (NULL); } if (unp->unp_conn == NULL) { UNP_PCB_UNLOCK(unp2); return (NULL); } return (unp2); } static void uipc_abort(struct socket *so) { struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); UNP_PCB_UNLOCK_ASSERT(unp); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 != NULL) { unp_pcb_hold(unp2); UNP_PCB_UNLOCK(unp); unp_drop(unp2); } else UNP_PCB_UNLOCK(unp); } static int uipc_accept(struct socket *so, struct sockaddr **nam) { struct unpcb *unp, *unp2; const struct sockaddr *sa; /* * Pass back name of connected socket, if it was bound and we are * still connected (our peer may have closed already!). */ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_PCB_LOCK(unp); unp2 = unp_pcb_lock_peer(unp); if (unp2 != NULL && unp2->unp_addr != NULL) sa = (struct sockaddr *)unp2->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); if (unp2 != NULL) unp_pcb_unlock_pair(unp, unp2); else UNP_PCB_UNLOCK(unp); return (0); } static int uipc_attach(struct socket *so, int proto, struct thread *td) { u_long sendspace, recvspace; struct unpcb *unp; int error; bool locked; KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { case SOCK_STREAM: sendspace = unpst_sendspace; recvspace = unpst_recvspace; break; case SOCK_DGRAM: STAILQ_INIT(&so->so_rcv.uxdg_mb); STAILQ_INIT(&so->so_snd.uxdg_mb); TAILQ_INIT(&so->so_rcv.uxdg_conns); /* * Since send buffer is either bypassed or is a part * of one-to-many receive buffer, we assign both space * limits to unpdg_recvspace. */ sendspace = recvspace = unpdg_recvspace; break; case SOCK_SEQPACKET: sendspace = unpsp_sendspace; recvspace = unpsp_recvspace; break; default: panic("uipc_attach"); } error = soreserve(so, sendspace, recvspace); if (error) return (error); } unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO); if (unp == NULL) return (ENOBUFS); LIST_INIT(&unp->unp_refs); UNP_PCB_LOCK_INIT(unp); unp->unp_socket = so; so->so_pcb = unp; refcount_init(&unp->unp_refcount, 1); if ((locked = UNP_LINK_WOWNED()) == false) UNP_LINK_WLOCK(); unp->unp_gencnt = ++unp_gencnt; unp->unp_ino = ++unp_ino; unp_count++; switch (so->so_type) { case SOCK_STREAM: LIST_INSERT_HEAD(&unp_shead, unp, unp_link); break; case SOCK_DGRAM: LIST_INSERT_HEAD(&unp_dhead, unp, unp_link); break; case SOCK_SEQPACKET: LIST_INSERT_HEAD(&unp_sphead, unp, unp_link); break; default: panic("uipc_attach"); } if (locked == false) UNP_LINK_WUNLOCK(); return (0); } static int uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vattr vattr; int error, namelen; struct nameidata nd; struct unpcb *unp; struct vnode *vp; struct mount *mp; cap_rights_t rights; char *buf; if (nam->sa_family != AF_UNIX) return (EAFNOSUPPORT); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); if (soun->sun_len > sizeof(struct sockaddr_un)) return (EINVAL); namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); if (namelen <= 0) return (EINVAL); /* * We don't allow simultaneous bind() calls on a single UNIX domain * socket, so flag in-progress operations, and return an error if an * operation is already in progress. * * Historically, we have not allowed a socket to be rebound, so this * also returns an error. Not allowing re-binding simplifies the * implementation and avoids a great many possible failure modes. */ UNP_PCB_LOCK(unp); if (unp->unp_vnode != NULL) { UNP_PCB_UNLOCK(unp); return (EINVAL); } if (unp->unp_flags & UNP_BINDING) { UNP_PCB_UNLOCK(unp); return (EALREADY); } unp->unp_flags |= UNP_BINDING; UNP_PCB_UNLOCK(unp); buf = malloc(namelen + 1, M_TEMP, M_WAITOK); bcopy(soun->sun_path, buf, namelen); buf[namelen] = 0; restart: - NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE, + NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | NOCACHE, UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT)); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); if (error) goto error; vp = nd.ni_vp; if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (vp != NULL) { vrele(vp); error = EADDRINUSE; goto error; } error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error) goto error; goto restart; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_pd->pd_cmask); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); NDFREE_PNBUF(&nd); if (error) { VOP_VPUT_PAIR(nd.ni_dvp, NULL, true); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; goto error; } vp = nd.ni_vp; ASSERT_VOP_ELOCKED(vp, "uipc_bind"); soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); UNP_PCB_LOCK(unp); VOP_UNP_BIND(vp, unp); unp->unp_vnode = vp; unp->unp_addr = soun; unp->unp_flags &= ~UNP_BINDING; UNP_PCB_UNLOCK(unp); vref(vp); VOP_VPUT_PAIR(nd.ni_dvp, &vp, true); vn_finished_write(mp); free(buf, M_TEMP); return (0); error: UNP_PCB_LOCK(unp); unp->unp_flags &= ~UNP_BINDING; UNP_PCB_UNLOCK(unp); free(buf, M_TEMP); return (error); } static int uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (uipc_bindat(AT_FDCWD, so, nam, td)); } static int uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; KASSERT(td == curthread, ("uipc_connect: td != curthread")); error = unp_connect(so, nam, td); return (error); } static int uipc_connectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; KASSERT(td == curthread, ("uipc_connectat: td != curthread")); error = unp_connectat(fd, so, nam, td, false); return (error); } static void uipc_close(struct socket *so) { struct unpcb *unp, *unp2; struct vnode *vp = NULL; struct mtx *vplock; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_close: unp == NULL")); vplock = NULL; if ((vp = unp->unp_vnode) != NULL) { vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); } UNP_PCB_LOCK(unp); if (vp && unp->unp_vnode == NULL) { mtx_unlock(vplock); vp = NULL; } if (vp != NULL) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; } if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) unp_disconnect(unp, unp2); else UNP_PCB_UNLOCK(unp); if (vp) { mtx_unlock(vplock); vrele(vp); } } static int uipc_connect2(struct socket *so1, struct socket *so2) { struct unpcb *unp, *unp2; if (so1->so_type != so2->so_type) return (EPROTOTYPE); unp = so1->so_pcb; KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); unp2 = so2->so_pcb; KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL")); unp_pcb_lock_pair(unp, unp2); unp_connect2(so1, so2, PRU_CONNECT2); unp_pcb_unlock_pair(unp, unp2); return (0); } static void uipc_detach(struct socket *so) { struct unpcb *unp, *unp2; struct mtx *vplock; struct vnode *vp; int local_unp_rights; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); vp = NULL; vplock = NULL; UNP_LINK_WLOCK(); LIST_REMOVE(unp, unp_link); if (unp->unp_gcflag & UNPGC_DEAD) LIST_REMOVE(unp, unp_dead); unp->unp_gencnt = ++unp_gencnt; --unp_count; UNP_LINK_WUNLOCK(); UNP_PCB_UNLOCK_ASSERT(unp); restart: if ((vp = unp->unp_vnode) != NULL) { vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); } UNP_PCB_LOCK(unp); if (unp->unp_vnode != vp && unp->unp_vnode != NULL) { if (vplock) mtx_unlock(vplock); UNP_PCB_UNLOCK(unp); goto restart; } if ((vp = unp->unp_vnode) != NULL) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; } if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) unp_disconnect(unp, unp2); else UNP_PCB_UNLOCK(unp); UNP_REF_LIST_LOCK(); while (!LIST_EMPTY(&unp->unp_refs)) { struct unpcb *ref = LIST_FIRST(&unp->unp_refs); unp_pcb_hold(ref); UNP_REF_LIST_UNLOCK(); MPASS(ref != unp); UNP_PCB_UNLOCK_ASSERT(ref); unp_drop(ref); UNP_REF_LIST_LOCK(); } UNP_REF_LIST_UNLOCK(); UNP_PCB_LOCK(unp); local_unp_rights = unp_rights; unp->unp_socket->so_pcb = NULL; unp->unp_socket = NULL; free(unp->unp_addr, M_SONAME); unp->unp_addr = NULL; if (!unp_pcb_rele(unp)) UNP_PCB_UNLOCK(unp); if (vp) { mtx_unlock(vplock); vrele(vp); } if (local_unp_rights) taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1); switch (so->so_type) { case SOCK_DGRAM: /* * Everything should have been unlinked/freed by unp_dispose() * and/or unp_disconnect(). */ MPASS(so->so_rcv.uxdg_peeked == NULL); MPASS(STAILQ_EMPTY(&so->so_rcv.uxdg_mb)); MPASS(TAILQ_EMPTY(&so->so_rcv.uxdg_conns)); MPASS(STAILQ_EMPTY(&so->so_snd.uxdg_mb)); } } static int uipc_disconnect(struct socket *so) { struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); UNP_PCB_LOCK(unp); if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) unp_disconnect(unp, unp2); else UNP_PCB_UNLOCK(unp); return (0); } static int uipc_listen(struct socket *so, int backlog, struct thread *td) { struct unpcb *unp; int error; MPASS(so->so_type != SOCK_DGRAM); /* * Synchronize with concurrent connection attempts. */ error = 0; unp = sotounpcb(so); UNP_PCB_LOCK(unp); if (unp->unp_conn != NULL || (unp->unp_flags & UNP_CONNECTING) != 0) error = EINVAL; else if (unp->unp_vnode == NULL) error = EDESTADDRREQ; if (error != 0) { UNP_PCB_UNLOCK(unp); return (error); } SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0) { cru2xt(td, &unp->unp_peercred); solisten_proto(so, backlog); } SOCK_UNLOCK(so); UNP_PCB_UNLOCK(unp); return (error); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp, *unp2; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LINK_RLOCK(); /* * XXX: It seems that this test always fails even when connection is * established. So, this else clause is added as workaround to * return PF_LOCAL sockaddr. */ unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); if (unp2->unp_addr != NULL) sa = (struct sockaddr *) unp2->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp2); } else { sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); } UNP_LINK_RUNLOCK(); return (0); } static int uipc_rcvd(struct socket *so, int flags) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; unp = sotounpcb(so); KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET, ("%s: socktype %d", __func__, so->so_type)); /* * Adjust backpressure on sender and wakeup any waiting to write. * * The unp lock is acquired to maintain the validity of the unp_conn * pointer; no lock on unp2 is required as unp2->unp_socket will be * static as long as we don't permit unp2 to disconnect from unp, * which is prevented by the lock on unp. We cache values from * so_rcv to avoid holding the so_rcv lock over the entire * transaction on the remote so_snd. */ SOCKBUF_LOCK(&so->so_rcv); mbcnt = so->so_rcv.sb_mbcnt; sbcc = sbavail(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); /* * There is a benign race condition at this point. If we're planning to * clear SB_STOP, but uipc_send is called on the connected socket at * this instant, it might add data to the sockbuf and set SB_STOP. Then * we would erroneously clear SB_STOP below, even though the sockbuf is * full. The race is benign because the only ill effect is to allow the * sockbuf to exceed its size limit, and the size limits are not * strictly guaranteed anyway. */ UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 == NULL) { UNP_PCB_UNLOCK(unp); return (0); } so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_snd); if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax) so2->so_snd.sb_flags &= ~SB_STOP; sowwakeup_locked(so2); UNP_PCB_UNLOCK(unp); return (0); } static int uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; int error; unp = sotounpcb(so); KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET, ("%s: socktype %d", __func__, so->so_type)); error = 0; if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control != NULL && (error = unp_internalize(&control, td, NULL, NULL, NULL))) goto release; unp2 = NULL; if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { if ((error = unp_connect(so, nam, td)) != 0) goto out; } else { error = ENOTCONN; goto out; } } UNP_PCB_LOCK(unp); if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) { UNP_PCB_UNLOCK(unp); error = ENOTCONN; goto out; } else if (so->so_snd.sb_state & SBS_CANTSENDMORE) { unp_pcb_unlock_pair(unp, unp2); error = EPIPE; goto out; } UNP_PCB_UNLOCK(unp); if ((so2 = unp2->unp_socket) == NULL) { UNP_PCB_UNLOCK(unp2); error = ENOTCONN; goto out; } SOCKBUF_LOCK(&so2->so_rcv); if (unp2->unp_flags & UNP_WANTCRED_MASK) { /* * Credentials are passed only once on SOCK_STREAM and * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS). */ control = unp_addsockcred(td, control, unp2->unp_flags, NULL, NULL, NULL); unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT; } /* * Send to paired receive port and wake up readers. Don't * check for space available in the receive buffer if we're * attaching ancillary data; Unix domain sockets only check * for space in the sending sockbuf, and that check is * performed one level up the stack. At that level we cannot * precisely account for the amount of buffer space used * (e.g., because control messages are not yet internalized). */ switch (so->so_type) { case SOCK_STREAM: if (control != NULL) { sbappendcontrol_locked(&so2->so_rcv, m, control, flags); control = NULL; } else sbappend_locked(&so2->so_rcv, m, flags); break; case SOCK_SEQPACKET: if (sbappendaddr_nospacecheck_locked(&so2->so_rcv, &sun_noname, m, control)) control = NULL; break; } mbcnt = so2->so_rcv.sb_mbcnt; sbcc = sbavail(&so2->so_rcv); if (sbcc) sorwakeup_locked(so2); else SOCKBUF_UNLOCK(&so2->so_rcv); /* * The PCB lock on unp2 protects the SB_STOP flag. Without it, * it would be possible for uipc_rcvd to be called at this * point, drain the receiving sockbuf, clear SB_STOP, and then * we would set SB_STOP below. That could lead to an empty * sockbuf having SB_STOP set */ SOCKBUF_LOCK(&so->so_snd); if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax) so->so_snd.sb_flags |= SB_STOP; SOCKBUF_UNLOCK(&so->so_snd); UNP_PCB_UNLOCK(unp2); m = NULL; out: /* * PRUS_EOF is equivalent to pr_send followed by pr_shutdown. */ if (flags & PRUS_EOF) { UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); UNP_PCB_UNLOCK(unp); } if (control != NULL && error != 0) unp_scan(control, unp_freerights); release: if (control != NULL) m_freem(control); /* * In case of PRUS_NOTREADY, uipc_ready() is responsible * for freeing memory. */ if (m != NULL && (flags & PRUS_NOTREADY) == 0) m_freem(m); return (error); } /* PF_UNIX/SOCK_DGRAM version of sbspace() */ static inline bool uipc_dgram_sbspace(struct sockbuf *sb, u_int cc, u_int mbcnt) { u_int bleft, mleft; /* * Negative space may happen if send(2) is followed by * setsockopt(SO_SNDBUF/SO_RCVBUF) that shrinks maximum. */ if (__predict_false(sb->sb_hiwat < sb->uxdg_cc || sb->sb_mbmax < sb->uxdg_mbcnt)) return (false); if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) return (false); bleft = sb->sb_hiwat - sb->uxdg_cc; mleft = sb->sb_mbmax - sb->uxdg_mbcnt; return (bleft >= cc && mleft >= mbcnt); } /* * PF_UNIX/SOCK_DGRAM send * * Allocate a record consisting of 3 mbufs in the sequence of * from -> control -> data and append it to the socket buffer. * * The first mbuf carries sender's name and is a pkthdr that stores * overall length of datagram, its memory consumption and control length. */ #define ctllen PH_loc.thirtytwo[1] _Static_assert(offsetof(struct pkthdr, memlen) + sizeof(u_int) <= offsetof(struct pkthdr, ctllen), "unix/dgram can not store ctllen"); static int uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *m, struct mbuf *c, int flags, struct thread *td) { struct unpcb *unp, *unp2; const struct sockaddr *from; struct socket *so2; struct sockbuf *sb; struct mbuf *f, *clast; u_int cc, ctl, mbcnt; u_int dcc __diagused, dctl __diagused, dmbcnt __diagused; int error; MPASS((uio != NULL && m == NULL) || (m != NULL && uio == NULL)); error = 0; f = NULL; ctl = 0; if (__predict_false(flags & MSG_OOB)) { error = EOPNOTSUPP; goto out; } if (m == NULL) { if (__predict_false(uio->uio_resid > unpdg_maxdgram)) { error = EMSGSIZE; goto out; } m = m_uiotombuf(uio, M_WAITOK, 0, max_hdr, M_PKTHDR); if (__predict_false(m == NULL)) { error = EFAULT; goto out; } f = m_gethdr(M_WAITOK, MT_SONAME); cc = m->m_pkthdr.len; mbcnt = MSIZE + m->m_pkthdr.memlen; if (c != NULL && (error = unp_internalize(&c, td, &clast, &ctl, &mbcnt))) goto out; } else { /* pr_sosend() with mbuf usually is a kernel thread. */ M_ASSERTPKTHDR(m); if (__predict_false(c != NULL)) panic("%s: control from a kernel thread", __func__); if (__predict_false(m->m_pkthdr.len > unpdg_maxdgram)) { error = EMSGSIZE; goto out; } if ((f = m_gethdr(M_NOWAIT, MT_SONAME)) == NULL) { error = ENOBUFS; goto out; } /* Condition the foreign mbuf to our standards. */ m_clrprotoflags(m); m_tag_delete_chain(m, NULL); m->m_pkthdr.rcvif = NULL; m->m_pkthdr.flowid = 0; m->m_pkthdr.csum_flags = 0; m->m_pkthdr.fibnum = 0; m->m_pkthdr.rsstype = 0; cc = m->m_pkthdr.len; mbcnt = MSIZE; for (struct mbuf *mb = m; mb != NULL; mb = mb->m_next) { mbcnt += MSIZE; if (mb->m_flags & M_EXT) mbcnt += mb->m_ext.ext_size; } } unp = sotounpcb(so); MPASS(unp); /* * XXXGL: would be cool to fully remove so_snd out of the equation * and avoid this lock, which is not only extraneous, but also being * released, thus still leaving possibility for a race. We can easily * handle SBS_CANTSENDMORE/SS_ISCONNECTED complement in unpcb, but it * is more difficult to invent something to handle so_error. */ error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) goto out2; SOCK_SENDBUF_LOCK(so); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCK_SENDBUF_UNLOCK(so); error = EPIPE; goto out3; } if (so->so_error != 0) { error = so->so_error; so->so_error = 0; SOCK_SENDBUF_UNLOCK(so); goto out3; } if (((so->so_state & SS_ISCONNECTED) == 0) && addr == NULL) { SOCK_SENDBUF_UNLOCK(so); error = EDESTADDRREQ; goto out3; } SOCK_SENDBUF_UNLOCK(so); if (addr != NULL) { if ((error = unp_connectat(AT_FDCWD, so, addr, td, true))) goto out3; UNP_PCB_LOCK_ASSERT(unp); unp2 = unp->unp_conn; UNP_PCB_LOCK_ASSERT(unp2); } else { UNP_PCB_LOCK(unp); unp2 = unp_pcb_lock_peer(unp); if (unp2 == NULL) { UNP_PCB_UNLOCK(unp); error = ENOTCONN; goto out3; } } if (unp2->unp_flags & UNP_WANTCRED_MASK) c = unp_addsockcred(td, c, unp2->unp_flags, &clast, &ctl, &mbcnt); if (unp->unp_addr != NULL) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; f->m_len = from->sa_len; MPASS(from->sa_len <= MLEN); bcopy(from, mtod(f, void *), from->sa_len); ctl += f->m_len; /* * Concatenate mbufs: from -> control -> data. * Save overall cc and mbcnt in "from" mbuf. */ if (c != NULL) { #ifdef INVARIANTS struct mbuf *mc; for (mc = c; mc->m_next != NULL; mc = mc->m_next); MPASS(mc == clast); #endif f->m_next = c; clast->m_next = m; c = NULL; } else f->m_next = m; m = NULL; #ifdef INVARIANTS dcc = dctl = dmbcnt = 0; for (struct mbuf *mb = f; mb != NULL; mb = mb->m_next) { if (mb->m_type == MT_DATA) dcc += mb->m_len; else dctl += mb->m_len; dmbcnt += MSIZE; if (mb->m_flags & M_EXT) dmbcnt += mb->m_ext.ext_size; } MPASS(dcc == cc); MPASS(dctl == ctl); MPASS(dmbcnt == mbcnt); #endif f->m_pkthdr.len = cc + ctl; f->m_pkthdr.memlen = mbcnt; f->m_pkthdr.ctllen = ctl; /* * Destination socket buffer selection. * * Unconnected sends, when !(so->so_state & SS_ISCONNECTED) and the * destination address is supplied, create a temporary connection for * the run time of the function (see call to unp_connectat() above and * to unp_disconnect() below). We distinguish them by condition of * (addr != NULL). We intentionally avoid adding 'bool connected' for * that condition, since, again, through the run time of this code we * are always connected. For such "unconnected" sends, the destination * buffer would be the receive buffer of destination socket so2. * * For connected sends, data lands on the send buffer of the sender's * socket "so". Then, if we just added the very first datagram * on this send buffer, we need to add the send buffer on to the * receiving socket's buffer list. We put ourselves on top of the * list. Such logic gives infrequent senders priority over frequent * senders. * * Note on byte count management. As long as event methods kevent(2), * select(2) are not protocol specific (yet), we need to maintain * meaningful values on the receive buffer. So, the receive buffer * would accumulate counters from all connected buffers potentially * having sb_ccc > sb_hiwat or sb_mbcnt > sb_mbmax. */ so2 = unp2->unp_socket; sb = (addr == NULL) ? &so->so_snd : &so2->so_rcv; SOCK_RECVBUF_LOCK(so2); if (uipc_dgram_sbspace(sb, cc + ctl, mbcnt)) { if (addr == NULL && STAILQ_EMPTY(&sb->uxdg_mb)) TAILQ_INSERT_HEAD(&so2->so_rcv.uxdg_conns, &so->so_snd, uxdg_clist); STAILQ_INSERT_TAIL(&sb->uxdg_mb, f, m_stailqpkt); sb->uxdg_cc += cc + ctl; sb->uxdg_ctl += ctl; sb->uxdg_mbcnt += mbcnt; so2->so_rcv.sb_acc += cc + ctl; so2->so_rcv.sb_ccc += cc + ctl; so2->so_rcv.sb_ctl += ctl; so2->so_rcv.sb_mbcnt += mbcnt; sorwakeup_locked(so2); f = NULL; } else { soroverflow_locked(so2); error = (so->so_state & SS_NBIO) ? EAGAIN : ENOBUFS; } if (addr != NULL) unp_disconnect(unp, unp2); else unp_pcb_unlock_pair(unp, unp2); td->td_ru.ru_msgsnd++; out3: SOCK_IO_SEND_UNLOCK(so); out2: if (c) unp_scan(c, unp_freerights); out: if (f) m_freem(f); if (c) m_freem(c); if (m) m_freem(m); return (error); } /* * PF_UNIX/SOCK_DGRAM receive with MSG_PEEK. * The mbuf has already been unlinked from the uxdg_mb of socket buffer * and needs to be linked onto uxdg_peeked of receive socket buffer. */ static int uipc_peek_dgram(struct socket *so, struct mbuf *m, struct sockaddr **psa, struct uio *uio, struct mbuf **controlp, int *flagsp) { ssize_t len = 0; int error; so->so_rcv.uxdg_peeked = m; so->so_rcv.uxdg_cc += m->m_pkthdr.len; so->so_rcv.uxdg_ctl += m->m_pkthdr.ctllen; so->so_rcv.uxdg_mbcnt += m->m_pkthdr.memlen; SOCK_RECVBUF_UNLOCK(so); KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK); m = m->m_next; KASSERT(m, ("%s: no data or control after soname", __func__)); /* * With MSG_PEEK the control isn't executed, just copied. */ while (m != NULL && m->m_type == MT_CONTROL) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_WAITOK); controlp = &(*controlp)->m_next; } m = m->m_next; } KASSERT(m == NULL || m->m_type == MT_DATA, ("%s: not MT_DATA mbuf %p", __func__, m)); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { SOCK_IO_RECV_UNLOCK(so); return (error); } if (len == m->m_len) m = m->m_next; } SOCK_IO_RECV_UNLOCK(so); if (flagsp != NULL) { if (m != NULL) { if (*flagsp & MSG_TRUNC) { /* Report real length of the packet */ uio->uio_resid -= m_length(m, NULL) - len; } *flagsp |= MSG_TRUNC; } else *flagsp &= ~MSG_TRUNC; } return (0); } /* * PF_UNIX/SOCK_DGRAM receive */ static int uipc_soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct sockbuf *sb = NULL; struct mbuf *m; int flags, error; ssize_t len = 0; bool nonblock; MPASS(mp0 == NULL); if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; flags = flagsp != NULL ? *flagsp : 0; nonblock = (so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT | MSG_NBIO)); error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (__predict_false(error)) return (error); /* * Loop blocking while waiting for a datagram. Prioritize connected * peers over unconnected sends. Set sb to selected socket buffer * containing an mbuf on exit from the wait loop. A datagram that * had already been peeked at has top priority. */ SOCK_RECVBUF_LOCK(so); while ((m = so->so_rcv.uxdg_peeked) == NULL && (sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) == NULL && (m = STAILQ_FIRST(&so->so_rcv.uxdg_mb)) == NULL) { if (so->so_error) { error = so->so_error; so->so_error = 0; SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); return (0); } if (nonblock) { SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); return (EWOULDBLOCK); } error = sbwait(so, SO_RCV); if (error) { SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); return (error); } } if (sb == NULL) sb = &so->so_rcv; else if (m == NULL) m = STAILQ_FIRST(&sb->uxdg_mb); else MPASS(m == so->so_rcv.uxdg_peeked); MPASS(sb->uxdg_cc > 0); M_ASSERTPKTHDR(m); KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; if (__predict_true(m != so->so_rcv.uxdg_peeked)) { STAILQ_REMOVE_HEAD(&sb->uxdg_mb, m_stailqpkt); if (STAILQ_EMPTY(&sb->uxdg_mb) && sb != &so->so_rcv) TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist); } else so->so_rcv.uxdg_peeked = NULL; sb->uxdg_cc -= m->m_pkthdr.len; sb->uxdg_ctl -= m->m_pkthdr.ctllen; sb->uxdg_mbcnt -= m->m_pkthdr.memlen; if (__predict_false(flags & MSG_PEEK)) return (uipc_peek_dgram(so, m, psa, uio, controlp, flagsp)); so->so_rcv.sb_acc -= m->m_pkthdr.len; so->so_rcv.sb_ccc -= m->m_pkthdr.len; so->so_rcv.sb_ctl -= m->m_pkthdr.ctllen; so->so_rcv.sb_mbcnt -= m->m_pkthdr.memlen; SOCK_RECVBUF_UNLOCK(so); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK); m = m_free(m); KASSERT(m, ("%s: no data or control after soname", __func__)); /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * unp_externalize() to perform externalization (or freeing if * controlp == NULL). In some cases there can be only MT_CONTROL mbufs * without MT_DATA mbufs. */ while (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm; /* XXXGL: unp_externalize() is also dom_externalize() KBI and * it frees whole chain, so we must disconnect the mbuf. */ cm = m; m = m->m_next; cm->m_next = NULL; error = unp_externalize(cm, controlp, flags); if (error != 0) { SOCK_IO_RECV_UNLOCK(so); unp_scan(m, unp_freerights); m_freem(m); return (error); } if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("%s: not MT_DATA mbuf %p", __func__, m)); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { SOCK_IO_RECV_UNLOCK(so); m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } SOCK_IO_RECV_UNLOCK(so); if (m != NULL) { if (flagsp != NULL) { if (flags & MSG_TRUNC) { /* Report real length of the packet */ uio->uio_resid -= m_length(m, NULL); } *flagsp |= MSG_TRUNC; } m_freem(m); } else if (flagsp != NULL) *flagsp &= ~MSG_TRUNC; return (0); } static bool uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp) { struct mbuf *mb, *n; struct sockbuf *sb; SOCK_LOCK(so); if (SOLISTENING(so)) { SOCK_UNLOCK(so); return (false); } mb = NULL; sb = &so->so_rcv; SOCKBUF_LOCK(sb); if (sb->sb_fnrdy != NULL) { for (mb = sb->sb_mb, n = mb->m_nextpkt; mb != NULL;) { if (mb == m) { *errorp = sbready(sb, m, count); break; } mb = mb->m_next; if (mb == NULL) { mb = n; if (mb != NULL) n = mb->m_nextpkt; } } } SOCKBUF_UNLOCK(sb); SOCK_UNLOCK(so); return (mb != NULL); } static int uipc_ready(struct socket *so, struct mbuf *m, int count) { struct unpcb *unp, *unp2; struct socket *so2; int error, i; unp = sotounpcb(so); KASSERT(so->so_type == SOCK_STREAM, ("%s: unexpected socket type for %p", __func__, so)); UNP_PCB_LOCK(unp); if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) { UNP_PCB_UNLOCK(unp); so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); if ((error = sbready(&so2->so_rcv, m, count)) == 0) sorwakeup_locked(so2); else SOCKBUF_UNLOCK(&so2->so_rcv); UNP_PCB_UNLOCK(unp2); return (error); } UNP_PCB_UNLOCK(unp); /* * The receiving socket has been disconnected, but may still be valid. * In this case, the now-ready mbufs are still present in its socket * buffer, so perform an exhaustive search before giving up and freeing * the mbufs. */ UNP_LINK_RLOCK(); LIST_FOREACH(unp, &unp_shead, unp_link) { if (uipc_ready_scan(unp->unp_socket, m, count, &error)) break; } UNP_LINK_RUNLOCK(); if (unp == NULL) { for (i = 0; i < count; i++) m = m_free(m); error = ECONNRESET; } return (error); } static int uipc_sense(struct socket *so, struct stat *sb) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); sb->st_blksize = so->so_snd.sb_hiwat; sb->st_dev = NODEV; sb->st_ino = unp->unp_ino; return (0); } static int uipc_shutdown(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); UNP_PCB_UNLOCK(unp); return (0); } static int uipc_sockaddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_PCB_LOCK(unp); if (unp->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp); return (0); } static int uipc_ctloutput(struct socket *so, struct sockopt *sopt) { struct unpcb *unp; struct xucred xu; int error, optval; if (sopt->sopt_level != SOL_LOCAL) return (EINVAL); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: UNP_PCB_LOCK(unp); if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { if (so->so_type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; } UNP_PCB_UNLOCK(unp); if (error == 0) error = sooptcopyout(sopt, &xu, sizeof(xu)); break; case LOCAL_CREDS: /* Unlocked read. */ optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case LOCAL_CREDS_PERSISTENT: /* Unlocked read. */ optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case LOCAL_CONNWAIT: /* Unlocked read. */ optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EOPNOTSUPP; break; } break; case SOPT_SET: switch (sopt->sopt_name) { case LOCAL_CREDS: case LOCAL_CREDS_PERSISTENT: case LOCAL_CONNWAIT: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; #define OPTSET(bit, exclusive) do { \ UNP_PCB_LOCK(unp); \ if (optval) { \ if ((unp->unp_flags & (exclusive)) != 0) { \ UNP_PCB_UNLOCK(unp); \ error = EINVAL; \ break; \ } \ unp->unp_flags |= (bit); \ } else \ unp->unp_flags &= ~(bit); \ UNP_PCB_UNLOCK(unp); \ } while (0) switch (sopt->sopt_name) { case LOCAL_CREDS: OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS); break; case LOCAL_CREDS_PERSISTENT: OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT); break; case LOCAL_CONNWAIT: OPTSET(UNP_CONNWAIT, 0); break; default: break; } break; #undef OPTSET default: error = ENOPROTOOPT; break; } break; default: error = EOPNOTSUPP; break; } return (error); } static int unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (unp_connectat(AT_FDCWD, so, nam, td, false)); } static int unp_connectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td, bool return_locked) { struct mtx *vplock; struct sockaddr_un *soun; struct vnode *vp; struct socket *so2; struct unpcb *unp, *unp2, *unp3; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; struct sockaddr *sa; cap_rights_t rights; int error, len; bool connreq; if (nam->sa_family != AF_UNIX) return (EAFNOSUPPORT); if (nam->sa_len > sizeof(struct sockaddr_un)) return (EINVAL); len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return (EINVAL); soun = (struct sockaddr_un *)nam; bcopy(soun->sun_path, buf, len); buf[len] = 0; error = 0; unp = sotounpcb(so); UNP_PCB_LOCK(unp); for (;;) { /* * Wait for connection state to stabilize. If a connection * already exists, give up. For datagram sockets, which permit * multiple consecutive connect(2) calls, upper layers are * responsible for disconnecting in advance of a subsequent * connect(2), but this is not synchronized with PCB connection * state. * * Also make sure that no threads are currently attempting to * lock the peer socket, to ensure that unp_conn cannot * transition between two valid sockets while locks are dropped. */ if (SOLISTENING(so)) error = EOPNOTSUPP; else if (unp->unp_conn != NULL) error = EISCONN; else if ((unp->unp_flags & UNP_CONNECTING) != 0) { error = EALREADY; } if (error != 0) { UNP_PCB_UNLOCK(unp); return (error); } if (unp->unp_pairbusy > 0) { unp->unp_flags |= UNP_WAITING; mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0); continue; } break; } unp->unp_flags |= UNP_CONNECTING; UNP_PCB_UNLOCK(unp); connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0; if (connreq) sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); else sa = NULL; NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT)); error = namei(&nd); if (error) vp = NULL; else vp = nd.ni_vp; ASSERT_VOP_LOCKED(vp, "unp_connect"); - NDFREE_NOTHING(&nd); if (error) goto bad; + NDFREE_PNBUF(&nd); if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD); if (error) goto bad; #endif error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); if (error) goto bad; unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); VOP_UNP_CONNECT(vp, &unp2); if (unp2 == NULL) { error = ECONNREFUSED; goto bad2; } so2 = unp2->unp_socket; if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad2; } if (connreq) { if (SOLISTENING(so2)) { CURVNET_SET(so2->so_vnet); so2 = sonewconn(so2, 0); CURVNET_RESTORE(); } else so2 = NULL; if (so2 == NULL) { error = ECONNREFUSED; goto bad2; } unp3 = sotounpcb(so2); unp_pcb_lock_pair(unp2, unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); unp3->unp_addr = (struct sockaddr_un *) sa; sa = NULL; } unp_copy_peercred(td, unp3, unp, unp2); UNP_PCB_UNLOCK(unp2); unp2 = unp3; /* * It is safe to block on the PCB lock here since unp2 is * nascent and cannot be connected to any other sockets. */ UNP_PCB_LOCK(unp); #ifdef MAC mac_socketpeer_set_from_socket(so, so2); mac_socketpeer_set_from_socket(so2, so); #endif } else { unp_pcb_lock_pair(unp, unp2); } KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && sotounpcb(so2) == unp2, ("%s: unp2 %p so2 %p", __func__, unp2, so2)); unp_connect2(so, so2, PRU_CONNECT); KASSERT((unp->unp_flags & UNP_CONNECTING) != 0, ("%s: unp %p has UNP_CONNECTING clear", __func__, unp)); unp->unp_flags &= ~UNP_CONNECTING; if (!return_locked) unp_pcb_unlock_pair(unp, unp2); bad2: mtx_unlock(vplock); bad: if (vp != NULL) { /* * If we are returning locked (called via uipc_sosend_dgram()), * we need to be sure that vput() won't sleep. This is * guaranteed by VOP_UNP_CONNECT() call above and unp2 lock. * SOCK_STREAM/SEQPACKET can't request return_locked (yet). */ MPASS(!(return_locked && connreq)); vput(vp); } free(sa, M_SONAME); if (__predict_false(error)) { UNP_PCB_LOCK(unp); KASSERT((unp->unp_flags & UNP_CONNECTING) != 0, ("%s: unp %p has UNP_CONNECTING clear", __func__, unp)); unp->unp_flags &= ~UNP_CONNECTING; UNP_PCB_UNLOCK(unp); } return (error); } /* * Set socket peer credentials at connection time. * * The client's PCB credentials are copied from its process structure. The * server's PCB credentials are copied from the socket on which it called * listen(2). uipc_listen cached that process's credentials at the time. */ void unp_copy_peercred(struct thread *td, struct unpcb *client_unp, struct unpcb *server_unp, struct unpcb *listen_unp) { cru2xt(td, &client_unp->unp_peercred); client_unp->unp_flags |= UNP_HAVEPC; memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred, sizeof(server_unp->unp_peercred)); server_unp->unp_flags |= UNP_HAVEPC; client_unp->unp_flags |= (listen_unp->unp_flags & UNP_WANTCRED_MASK); } static void unp_connect2(struct socket *so, struct socket *so2, conn2_how req) { struct unpcb *unp; struct unpcb *unp2; MPASS(so2->so_type == so->so_type); unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect2: unp == NULL")); unp2 = sotounpcb(so2); KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); KASSERT(unp->unp_conn == NULL, ("%s: socket %p is already connected", __func__, unp)); unp->unp_conn = unp2; unp_pcb_hold(unp2); unp_pcb_hold(unp); switch (so->so_type) { case SOCK_DGRAM: UNP_REF_LIST_LOCK(); LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); UNP_REF_LIST_UNLOCK(); soisconnected(so); break; case SOCK_STREAM: case SOCK_SEQPACKET: KASSERT(unp2->unp_conn == NULL, ("%s: socket %p is already connected", __func__, unp2)); unp2->unp_conn = unp; if (req == PRU_CONNECT && ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) soisconnecting(so); else soisconnected(so); soisconnected(so2); break; default: panic("unp_connect2"); } } static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2) { struct socket *so, *so2; struct mbuf *m = NULL; #ifdef INVARIANTS struct unpcb *unptmp; #endif UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); KASSERT(unp->unp_conn == unp2, ("%s: unpcb %p is not connected to %p", __func__, unp, unp2)); unp->unp_conn = NULL; so = unp->unp_socket; so2 = unp2->unp_socket; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: /* * Remove our send socket buffer from the peer's receive buffer. * Move the data to the receive buffer only if it is empty. * This is a protection against a scenario where a peer * connects, floods and disconnects, effectively blocking * sendto() from unconnected sockets. */ SOCK_RECVBUF_LOCK(so2); if (!STAILQ_EMPTY(&so->so_snd.uxdg_mb)) { TAILQ_REMOVE(&so2->so_rcv.uxdg_conns, &so->so_snd, uxdg_clist); if (__predict_true((so2->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) && STAILQ_EMPTY(&so2->so_rcv.uxdg_mb)) { STAILQ_CONCAT(&so2->so_rcv.uxdg_mb, &so->so_snd.uxdg_mb); so2->so_rcv.uxdg_cc += so->so_snd.uxdg_cc; so2->so_rcv.uxdg_ctl += so->so_snd.uxdg_ctl; so2->so_rcv.uxdg_mbcnt += so->so_snd.uxdg_mbcnt; } else { m = STAILQ_FIRST(&so->so_snd.uxdg_mb); STAILQ_INIT(&so->so_snd.uxdg_mb); so2->so_rcv.sb_acc -= so->so_snd.uxdg_cc; so2->so_rcv.sb_ccc -= so->so_snd.uxdg_cc; so2->so_rcv.sb_ctl -= so->so_snd.uxdg_ctl; so2->so_rcv.sb_mbcnt -= so->so_snd.uxdg_mbcnt; } /* Note: so may reconnect. */ so->so_snd.uxdg_cc = 0; so->so_snd.uxdg_ctl = 0; so->so_snd.uxdg_mbcnt = 0; } SOCK_RECVBUF_UNLOCK(so2); UNP_REF_LIST_LOCK(); #ifdef INVARIANTS LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) { if (unptmp == unp) break; } KASSERT(unptmp != NULL, ("%s: %p not found in reflist of %p", __func__, unp, unp2)); #endif LIST_REMOVE(unp, unp_reflink); UNP_REF_LIST_UNLOCK(); if (so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); } break; case SOCK_STREAM: case SOCK_SEQPACKET: if (so) soisdisconnected(so); MPASS(unp2->unp_conn == unp); unp2->unp_conn = NULL; if (so2) soisdisconnected(so2); break; } if (unp == unp2) { unp_pcb_rele_notlast(unp); if (!unp_pcb_rele(unp)) UNP_PCB_UNLOCK(unp); } else { if (!unp_pcb_rele(unp)) UNP_PCB_UNLOCK(unp); if (!unp_pcb_rele(unp2)) UNP_PCB_UNLOCK(unp2); } if (m != NULL) { unp_scan(m, unp_freerights); m_freem(m); } } /* * unp_pcblist() walks the global list of struct unpcb's to generate a * pointer list, bumping the refcount on each unpcb. It then copies them out * sequentially, validating the generation number on each to see if it has * been detached. All of this is necessary because copyout() may sleep on * disk I/O. */ static int unp_pcblist(SYSCTL_HANDLER_ARGS) { struct unpcb *unp, **unp_list; unp_gen_t gencnt; struct xunpgen *xug; struct unp_head *head; struct xunpcb *xu; u_int i; int error, n; switch ((intptr_t)arg1) { case SOCK_STREAM: head = &unp_shead; break; case SOCK_DGRAM: head = &unp_dhead; break; case SOCK_SEQPACKET: head = &unp_sphead; break; default: panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1); } /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = unp_count; req->oldidx = 2 * (sizeof *xug) + (n + n/8) * sizeof(struct xunpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO); UNP_LINK_RLOCK(); gencnt = unp_gencnt; n = unp_count; UNP_LINK_RUNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; xug->xug_gen = gencnt; xug->xug_sogen = so_gencnt; error = SYSCTL_OUT(req, xug, sizeof *xug); if (error) { free(xug, M_TEMP); return (error); } unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); UNP_LINK_RLOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { UNP_PCB_LOCK(unp); if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_ucred, unp->unp_socket->so_cred)) { UNP_PCB_UNLOCK(unp); continue; } unp_list[i++] = unp; unp_pcb_hold(unp); } UNP_PCB_UNLOCK(unp); } UNP_LINK_RUNLOCK(); n = i; /* In case we lost some during malloc. */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); for (i = 0; i < n; i++) { unp = unp_list[i]; UNP_PCB_LOCK(unp); if (unp_pcb_rele(unp)) continue; if (unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = (uintptr_t)unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. */ if (unp->unp_addr != NULL) bcopy(unp->unp_addr, &xu->xu_addr, unp->unp_addr->sun_len); else bzero(&xu->xu_addr, sizeof(xu->xu_addr)); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) bcopy(unp->unp_conn->unp_addr, &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); else bzero(&xu->xu_caddr, sizeof(xu->xu_caddr)); xu->unp_vnode = (uintptr_t)unp->unp_vnode; xu->unp_conn = (uintptr_t)unp->unp_conn; xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs); xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink); xu->unp_gencnt = unp->unp_gencnt; sotoxsocket(unp->unp_socket, &xu->xu_socket); UNP_PCB_UNLOCK(unp); error = SYSCTL_OUT(req, xu, sizeof *xu); } else { UNP_PCB_UNLOCK(unp); } } free(xu, M_TEMP); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ xug->xug_gen = unp_gencnt; xug->xug_sogen = so_gencnt; xug->xug_count = unp_count; error = SYSCTL_OUT(req, xug, sizeof *xug); } free(unp_list, M_TEMP); free(xug, M_TEMP); return (error); } SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb", "List of active local seqpacket sockets"); static void unp_shutdown(struct unpcb *unp) { struct unpcb *unp2; struct socket *so; UNP_PCB_LOCK_ASSERT(unp); unp2 = unp->unp_conn; if ((unp->unp_socket->so_type == SOCK_STREAM || (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) { so = unp2->unp_socket; if (so != NULL) socantrcvmore(so); } } static void unp_drop(struct unpcb *unp) { struct socket *so; struct unpcb *unp2; /* * Regardless of whether the socket's peer dropped the connection * with this socket by aborting or disconnecting, POSIX requires * that ECONNRESET is returned. */ UNP_PCB_LOCK(unp); so = unp->unp_socket; if (so) so->so_error = ECONNRESET; if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) { /* Last reference dropped in unp_disconnect(). */ unp_pcb_rele_notlast(unp); unp_disconnect(unp, unp2); } else if (!unp_pcb_rele(unp)) { UNP_PCB_UNLOCK(unp); } } static void unp_freerights(struct filedescent **fdep, int fdcount) { struct file *fp; int i; KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount)); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; filecaps_free(&fdep[i]->fde_caps); unp_discard(fp); } free(fdep[0], M_FILECAPS); } static int unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags) { struct thread *td = curthread; /* XXX */ struct cmsghdr *cm = mtod(control, struct cmsghdr *); int i; int *fdp; struct filedesc *fdesc = td->td_proc->p_fd; struct filedescent **fdep; void *data; socklen_t clen = control->m_len, datalen; int error, newfds; u_int newlen; UNP_LINK_UNLOCK_ASSERT(); error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ *controlp = NULL; while (cm != NULL) { MPASS(clen >= sizeof(*cm) && clen >= cm->cmsg_len); data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { newfds = datalen / sizeof(*fdep); if (newfds == 0) goto next; fdep = data; /* If we're not outputting the descriptors free them. */ if (error || controlp == NULL) { unp_freerights(fdep, newfds); goto next; } FILEDESC_XLOCK(fdesc); /* * Now change each pointer to an fd in the global * table to an integer that is the index to the local * fd table entry that we set up to point to the * global one we are transferring. */ newlen = newfds * sizeof(int); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET, M_WAITOK); fdp = (int *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); if ((error = fdallocn(td, 0, fdp, newfds))) { FILEDESC_XUNLOCK(fdesc); unp_freerights(fdep, newfds); m_freem(*controlp); *controlp = NULL; goto next; } for (i = 0; i < newfds; i++, fdp++) { _finstall(fdesc, fdep[i]->fde_file, *fdp, (flags & MSG_CMSG_CLOEXEC) != 0 ? O_CLOEXEC : 0, &fdep[i]->fde_caps); unp_externalize_fp(fdep[i]->fde_file); } /* * The new type indicates that the mbuf data refers to * kernel resources that may need to be released before * the mbuf is freed. */ m_chtype(*controlp, MT_EXTCONTROL); FILEDESC_XUNLOCK(fdesc); free(fdep[0], M_FILECAPS); } else { /* We can just copy anything else across. */ if (error || controlp == NULL) goto next; *controlp = sbcreatecontrol(NULL, datalen, cm->cmsg_type, cm->cmsg_level, M_WAITOK); bcopy(data, CMSG_DATA(mtod(*controlp, struct cmsghdr *)), datalen); } controlp = &(*controlp)->m_next; next: if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } m_freem(control); return (error); } static void unp_zone_change(void *tag) { uma_zone_set_max(unp_zone, maxsockets); } #ifdef INVARIANTS static void unp_zdtor(void *mem, int size __unused, void *arg __unused) { struct unpcb *unp; unp = mem; KASSERT(LIST_EMPTY(&unp->unp_refs), ("%s: unpcb %p has lingering refs", __func__, unp)); KASSERT(unp->unp_socket == NULL, ("%s: unpcb %p has socket backpointer", __func__, unp)); KASSERT(unp->unp_vnode == NULL, ("%s: unpcb %p has vnode references", __func__, unp)); KASSERT(unp->unp_conn == NULL, ("%s: unpcb %p is still connected", __func__, unp)); KASSERT(unp->unp_addr == NULL, ("%s: unpcb %p has leaked addr", __func__, unp)); } #endif static void unp_init(void *arg __unused) { uma_dtor dtor; #ifdef INVARIANTS dtor = unp_zdtor; #else dtor = NULL; #endif unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, dtor, NULL, NULL, UMA_ALIGN_CACHE, 0); uma_zone_set_max(unp_zone, maxsockets); uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, NULL, EVENTHANDLER_PRI_ANY); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); LIST_INIT(&unp_sphead); SLIST_INIT(&unp_defers); TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL); TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL); UNP_LINK_LOCK_INIT(); UNP_DEFERRED_LOCK_INIT(); } SYSINIT(unp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, unp_init, NULL); static void unp_internalize_cleanup_rights(struct mbuf *control) { struct cmsghdr *cp; struct mbuf *m; void *data; socklen_t datalen; for (m = control; m != NULL; m = m->m_next) { cp = mtod(m, struct cmsghdr *); if (cp->cmsg_level != SOL_SOCKET || cp->cmsg_type != SCM_RIGHTS) continue; data = CMSG_DATA(cp); datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data; unp_freerights(data, datalen / sizeof(struct filedesc *)); } } static int unp_internalize(struct mbuf **controlp, struct thread *td, struct mbuf **clast, u_int *space, u_int *mbcnt) { struct mbuf *control, **initial_controlp; struct proc *p; struct filedesc *fdesc; struct bintime *bt; struct cmsghdr *cm; struct cmsgcred *cmcred; struct filedescent *fde, **fdep, *fdev; struct file *fp; struct timeval *tv; struct timespec *ts; void *data; socklen_t clen, datalen; int i, j, error, *fdp, oldfds; u_int newlen; MPASS((*controlp)->m_next == NULL); /* COMPAT_OLDSOCK may violate */ UNP_LINK_UNLOCK_ASSERT(); p = td->td_proc; fdesc = p->p_fd; error = 0; control = *controlp; *controlp = NULL; initial_controlp = controlp; for (clen = control->m_len, cm = mtod(control, struct cmsghdr *), data = CMSG_DATA(cm); clen >= sizeof(*cm) && cm->cmsg_level == SOL_SOCKET && clen >= cm->cmsg_len && cm->cmsg_len >= sizeof(*cm) && (char *)cm + cm->cmsg_len >= (char *)data; clen -= min(CMSG_SPACE(datalen), clen), cm = (struct cmsghdr *) ((char *)cm + CMSG_SPACE(datalen)), data = CMSG_DATA(cm)) { datalen = (char *)cm + cm->cmsg_len - (char *)data; switch (cm->cmsg_type) { case SCM_CREDS: *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS, SOL_SOCKET, M_WAITOK); cmcred = (struct cmsgcred *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); cmcred->cmcred_pid = p->p_pid; cmcred->cmcred_uid = td->td_ucred->cr_ruid; cmcred->cmcred_gid = td->td_ucred->cr_rgid; cmcred->cmcred_euid = td->td_ucred->cr_uid; cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); for (i = 0; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = td->td_ucred->cr_groups[i]; break; case SCM_RIGHTS: oldfds = datalen / sizeof (int); if (oldfds == 0) continue; /* On some machines sizeof pointer is bigger than * sizeof int, so we need to check if data fits into * single mbuf. We could allocate several mbufs, and * unp_externalize() should even properly handle that. * But it is not worth to complicate the code for an * insane scenario of passing over 200 file descriptors * at once. */ newlen = oldfds * sizeof(fdep[0]); if (CMSG_SPACE(newlen) > MCLBYTES) { error = EMSGSIZE; goto out; } /* * Check that all the FDs passed in refer to legal * files. If not, reject the entire operation. */ fdp = data; FILEDESC_SLOCK(fdesc); for (i = 0; i < oldfds; i++, fdp++) { fp = fget_noref(fdesc, *fdp); if (fp == NULL) { FILEDESC_SUNLOCK(fdesc); error = EBADF; goto out; } if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { FILEDESC_SUNLOCK(fdesc); error = EOPNOTSUPP; goto out; } } /* * Now replace the integer FDs with pointers to the * file structure and capability rights. */ *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET, M_WAITOK); fdp = data; for (i = 0; i < oldfds; i++, fdp++) { if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) { fdp = data; for (j = 0; j < i; j++, fdp++) { fdrop(fdesc->fd_ofiles[*fdp]. fde_file, td); } FILEDESC_SUNLOCK(fdesc); error = EBADF; goto out; } } fdp = data; fdep = (struct filedescent **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS, M_WAITOK); for (i = 0; i < oldfds; i++, fdev++, fdp++) { fde = &fdesc->fd_ofiles[*fdp]; fdep[i] = fdev; fdep[i]->fde_file = fde->fde_file; filecaps_copy(&fde->fde_caps, &fdep[i]->fde_caps, true); unp_internalize_fp(fdep[i]->fde_file); } FILEDESC_SUNLOCK(fdesc); break; case SCM_TIMESTAMP: *controlp = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET, M_WAITOK); tv = (struct timeval *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); microtime(tv); break; case SCM_BINTIME: *controlp = sbcreatecontrol(NULL, sizeof(*bt), SCM_BINTIME, SOL_SOCKET, M_WAITOK); bt = (struct bintime *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); bintime(bt); break; case SCM_REALTIME: *controlp = sbcreatecontrol(NULL, sizeof(*ts), SCM_REALTIME, SOL_SOCKET, M_WAITOK); ts = (struct timespec *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); nanotime(ts); break; case SCM_MONOTONIC: *controlp = sbcreatecontrol(NULL, sizeof(*ts), SCM_MONOTONIC, SOL_SOCKET, M_WAITOK); ts = (struct timespec *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); nanouptime(ts); break; default: error = EINVAL; goto out; } if (space != NULL) { *space += (*controlp)->m_len; *mbcnt += MSIZE; if ((*controlp)->m_flags & M_EXT) *mbcnt += (*controlp)->m_ext.ext_size; *clast = *controlp; } controlp = &(*controlp)->m_next; } if (clen > 0) error = EINVAL; out: if (error != 0 && initial_controlp != NULL) unp_internalize_cleanup_rights(*initial_controlp); m_freem(control); return (error); } static struct mbuf * unp_addsockcred(struct thread *td, struct mbuf *control, int mode, struct mbuf **clast, u_int *space, u_int *mbcnt) { struct mbuf *m, *n, *n_prev; const struct cmsghdr *cm; int ngroups, i, cmsgtype; size_t ctrlsz; ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); if (mode & UNP_WANTCRED_ALWAYS) { ctrlsz = SOCKCRED2SIZE(ngroups); cmsgtype = SCM_CREDS2; } else { ctrlsz = SOCKCREDSIZE(ngroups); cmsgtype = SCM_CREDS; } m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET, M_NOWAIT); if (m == NULL) return (control); MPASS((m->m_flags & M_EXT) == 0 && m->m_next == NULL); if (mode & UNP_WANTCRED_ALWAYS) { struct sockcred2 *sc; sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *)); sc->sc_version = 0; sc->sc_pid = td->td_proc->p_pid; sc->sc_uid = td->td_ucred->cr_ruid; sc->sc_euid = td->td_ucred->cr_uid; sc->sc_gid = td->td_ucred->cr_rgid; sc->sc_egid = td->td_ucred->cr_gid; sc->sc_ngroups = ngroups; for (i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = td->td_ucred->cr_groups[i]; } else { struct sockcred *sc; sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *)); sc->sc_uid = td->td_ucred->cr_ruid; sc->sc_euid = td->td_ucred->cr_uid; sc->sc_gid = td->td_ucred->cr_rgid; sc->sc_egid = td->td_ucred->cr_gid; sc->sc_ngroups = ngroups; for (i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = td->td_ucred->cr_groups[i]; } /* * Unlink SCM_CREDS control messages (struct cmsgcred), since just * created SCM_CREDS control message (struct sockcred) has another * format. */ if (control != NULL && cmsgtype == SCM_CREDS) for (n = control, n_prev = NULL; n != NULL;) { cm = mtod(n, struct cmsghdr *); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_CREDS) { if (n_prev == NULL) control = n->m_next; else n_prev->m_next = n->m_next; if (space != NULL) { MPASS(*space >= n->m_len); *space -= n->m_len; MPASS(*mbcnt >= MSIZE); *mbcnt -= MSIZE; if (n->m_flags & M_EXT) { MPASS(*mbcnt >= n->m_ext.ext_size); *mbcnt -= n->m_ext.ext_size; } MPASS(clast); if (*clast == n) { MPASS(n->m_next == NULL); if (n_prev == NULL) *clast = m; else *clast = n_prev; } } n = m_free(n); } else { n_prev = n; n = n->m_next; } } /* Prepend it to the head. */ m->m_next = control; if (space != NULL) { *space += m->m_len; *mbcnt += MSIZE; if (control == NULL) *clast = m; } return (m); } static struct unpcb * fptounp(struct file *fp) { struct socket *so; if (fp->f_type != DTYPE_SOCKET) return (NULL); if ((so = fp->f_data) == NULL) return (NULL); if (so->so_proto->pr_domain != &localdomain) return (NULL); return sotounpcb(so); } static void unp_discard(struct file *fp) { struct unp_defer *dr; if (unp_externalize_fp(fp)) { dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK); dr->ud_fp = fp; UNP_DEFERRED_LOCK(); SLIST_INSERT_HEAD(&unp_defers, dr, ud_link); UNP_DEFERRED_UNLOCK(); atomic_add_int(&unp_defers_count, 1); taskqueue_enqueue(taskqueue_thread, &unp_defer_task); } else closef_nothread(fp); } static void unp_process_defers(void *arg __unused, int pending) { struct unp_defer *dr; SLIST_HEAD(, unp_defer) drl; int count; SLIST_INIT(&drl); for (;;) { UNP_DEFERRED_LOCK(); if (SLIST_FIRST(&unp_defers) == NULL) { UNP_DEFERRED_UNLOCK(); break; } SLIST_SWAP(&unp_defers, &drl, unp_defer); UNP_DEFERRED_UNLOCK(); count = 0; while ((dr = SLIST_FIRST(&drl)) != NULL) { SLIST_REMOVE_HEAD(&drl, ud_link); closef_nothread(dr->ud_fp); free(dr, M_TEMP); count++; } atomic_add_int(&unp_defers_count, -count); } } static void unp_internalize_fp(struct file *fp) { struct unpcb *unp; UNP_LINK_WLOCK(); if ((unp = fptounp(fp)) != NULL) { unp->unp_file = fp; unp->unp_msgcount++; } unp_rights++; UNP_LINK_WUNLOCK(); } static int unp_externalize_fp(struct file *fp) { struct unpcb *unp; int ret; UNP_LINK_WLOCK(); if ((unp = fptounp(fp)) != NULL) { unp->unp_msgcount--; ret = 1; } else ret = 0; unp_rights--; UNP_LINK_WUNLOCK(); return (ret); } /* * unp_defer indicates whether additional work has been defered for a future * pass through unp_gc(). It is thread local and does not require explicit * synchronization. */ static int unp_marked; static void unp_remove_dead_ref(struct filedescent **fdep, int fdcount) { struct unpcb *unp; struct file *fp; int i; /* * This function can only be called from the gc task. */ KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0, ("%s: not on gc callout", __func__)); UNP_LINK_LOCK_ASSERT(); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; if ((unp = fptounp(fp)) == NULL) continue; if ((unp->unp_gcflag & UNPGC_DEAD) == 0) continue; unp->unp_gcrefs--; } } static void unp_restore_undead_ref(struct filedescent **fdep, int fdcount) { struct unpcb *unp; struct file *fp; int i; /* * This function can only be called from the gc task. */ KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0, ("%s: not on gc callout", __func__)); UNP_LINK_LOCK_ASSERT(); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; if ((unp = fptounp(fp)) == NULL) continue; if ((unp->unp_gcflag & UNPGC_DEAD) == 0) continue; unp->unp_gcrefs++; unp_marked++; } } static void unp_scan_socket(struct socket *so, void (*op)(struct filedescent **, int)) { struct sockbuf *sb; SOCK_LOCK_ASSERT(so); if (sotounpcb(so)->unp_gcflag & UNPGC_IGNORE_RIGHTS) return; SOCK_RECVBUF_LOCK(so); switch (so->so_type) { case SOCK_DGRAM: unp_scan(STAILQ_FIRST(&so->so_rcv.uxdg_mb), op); unp_scan(so->so_rcv.uxdg_peeked, op); TAILQ_FOREACH(sb, &so->so_rcv.uxdg_conns, uxdg_clist) unp_scan(STAILQ_FIRST(&sb->uxdg_mb), op); break; case SOCK_STREAM: case SOCK_SEQPACKET: unp_scan(so->so_rcv.sb_mb, op); break; } SOCK_RECVBUF_UNLOCK(so); } static void unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int)) { struct socket *so, *soa; so = unp->unp_socket; SOCK_LOCK(so); if (SOLISTENING(so)) { /* * Mark all sockets in our accept queue. */ TAILQ_FOREACH(soa, &so->sol_comp, so_list) unp_scan_socket(soa, op); } else { /* * Mark all sockets we reference with RIGHTS. */ unp_scan_socket(so, op); } SOCK_UNLOCK(so); } static int unp_recycled; SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, "Number of unreachable sockets claimed by the garbage collector."); static int unp_taskcount; SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, "Number of times the garbage collector has run."); SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0, "Number of active local sockets."); static void unp_gc(__unused void *arg, int pending) { struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead, NULL }; struct unp_head **head; struct unp_head unp_deadhead; /* List of potentially-dead sockets. */ struct file *f, **unref; struct unpcb *unp, *unptmp; int i, total, unp_unreachable; LIST_INIT(&unp_deadhead); unp_taskcount++; UNP_LINK_RLOCK(); /* * First determine which sockets may be in cycles. */ unp_unreachable = 0; for (head = heads; *head != NULL; head++) LIST_FOREACH(unp, *head, unp_link) { KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0, ("%s: unp %p has unexpected gc flags 0x%x", __func__, unp, (unsigned int)unp->unp_gcflag)); f = unp->unp_file; /* * Check for an unreachable socket potentially in a * cycle. It must be in a queue as indicated by * msgcount, and this must equal the file reference * count. Note that when msgcount is 0 the file is * NULL. */ if (f != NULL && unp->unp_msgcount != 0 && refcount_load(&f->f_count) == unp->unp_msgcount) { LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead); unp->unp_gcflag |= UNPGC_DEAD; unp->unp_gcrefs = unp->unp_msgcount; unp_unreachable++; } } /* * Scan all sockets previously marked as potentially being in a cycle * and remove the references each socket holds on any UNPGC_DEAD * sockets in its queue. After this step, all remaining references on * sockets marked UNPGC_DEAD should not be part of any cycle. */ LIST_FOREACH(unp, &unp_deadhead, unp_dead) unp_gc_scan(unp, unp_remove_dead_ref); /* * If a socket still has a non-negative refcount, it cannot be in a * cycle. In this case increment refcount of all children iteratively. * Stop the scan once we do a complete loop without discovering * a new reachable socket. */ do { unp_marked = 0; LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp) if (unp->unp_gcrefs > 0) { unp->unp_gcflag &= ~UNPGC_DEAD; LIST_REMOVE(unp, unp_dead); KASSERT(unp_unreachable > 0, ("%s: unp_unreachable underflow.", __func__)); unp_unreachable--; unp_gc_scan(unp, unp_restore_undead_ref); } } while (unp_marked); UNP_LINK_RUNLOCK(); if (unp_unreachable == 0) return; /* * Allocate space for a local array of dead unpcbs. * TODO: can this path be simplified by instead using the local * dead list at unp_deadhead, after taking out references * on the file object and/or unpcb and dropping the link lock? */ unref = malloc(unp_unreachable * sizeof(struct file *), M_TEMP, M_WAITOK); /* * Iterate looking for sockets which have been specifically marked * as unreachable and store them locally. */ UNP_LINK_RLOCK(); total = 0; LIST_FOREACH(unp, &unp_deadhead, unp_dead) { KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0, ("%s: unp %p not marked UNPGC_DEAD", __func__, unp)); unp->unp_gcflag &= ~UNPGC_DEAD; f = unp->unp_file; if (unp->unp_msgcount == 0 || f == NULL || refcount_load(&f->f_count) != unp->unp_msgcount || !fhold(f)) continue; unref[total++] = f; KASSERT(total <= unp_unreachable, ("%s: incorrect unreachable count.", __func__)); } UNP_LINK_RUNLOCK(); /* * Now flush all sockets, free'ing rights. This will free the * struct files associated with these sockets but leave each socket * with one remaining ref. */ for (i = 0; i < total; i++) { struct socket *so; so = unref[i]->f_data; CURVNET_SET(so->so_vnet); sorflush(so); CURVNET_RESTORE(); } /* * And finally release the sockets so they can be reclaimed. */ for (i = 0; i < total; i++) fdrop(unref[i], NULL); unp_recycled += total; free(unref, M_TEMP); } /* * Synchronize against unp_gc, which can trip over data as we are freeing it. */ static void unp_dispose(struct socket *so) { struct sockbuf *sb; struct unpcb *unp; struct mbuf *m; MPASS(!SOLISTENING(so)); unp = sotounpcb(so); UNP_LINK_WLOCK(); unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS; UNP_LINK_WUNLOCK(); /* * Grab our special mbufs before calling sbrelease(). */ SOCK_RECVBUF_LOCK(so); switch (so->so_type) { case SOCK_DGRAM: while ((sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) != NULL) { STAILQ_CONCAT(&so->so_rcv.uxdg_mb, &sb->uxdg_mb); TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist); /* Note: socket of sb may reconnect. */ sb->uxdg_cc = sb->uxdg_ctl = sb->uxdg_mbcnt = 0; } sb = &so->so_rcv; if (sb->uxdg_peeked != NULL) { STAILQ_INSERT_HEAD(&sb->uxdg_mb, sb->uxdg_peeked, m_stailqpkt); sb->uxdg_peeked = NULL; } m = STAILQ_FIRST(&sb->uxdg_mb); STAILQ_INIT(&sb->uxdg_mb); /* XXX: our shortened sbrelease() */ (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); /* * XXXGL Mark sb with SBS_CANTRCVMORE. This is needed to * prevent uipc_sosend_dgram() or unp_disconnect() adding more * data to the socket. * We are now in dom_dispose and it could be a call from * soshutdown() or from the final sofree(). The sofree() case * is simple as it guarantees that no more sends will happen, * however we can race with unp_disconnect() from our peer. * The shutdown(2) case is more exotic. It would call into * dom_dispose() only if socket is SS_ISCONNECTED. This is * possible if we did connect(2) on this socket and we also * had it bound with bind(2) and receive connections from other * sockets. Because soshutdown() violates POSIX (see comment * there) we will end up here shutting down our receive side. * Of course this will have affect not only on the peer we * connect(2)ed to, but also on all of the peers who had * connect(2)ed to us. Their sends would end up with ENOBUFS. */ sb->sb_state |= SBS_CANTRCVMORE; break; case SOCK_STREAM: case SOCK_SEQPACKET: sb = &so->so_rcv; m = sbcut_locked(sb, sb->sb_ccc); KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0, ("%s: ccc %u mb %p mbcnt %u", __func__, sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt)); sbrelease_locked(so, SO_RCV); break; } SOCK_RECVBUF_UNLOCK(so); if (SOCK_IO_RECV_OWNED(so)) SOCK_IO_RECV_UNLOCK(so); if (m != NULL) { unp_scan(m, unp_freerights); m_freem(m); } } static void unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int)) { struct mbuf *m; struct cmsghdr *cm; void *data; socklen_t clen, datalen; while (m0 != NULL) { for (m = m0; m; m = m->m_next) { if (m->m_type != MT_CONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) break; data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { (*op)(data, datalen / sizeof(struct filedescent *)); } if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } } m0 = m0->m_nextpkt; } } /* * Definitions of protocols supported in the LOCAL domain. */ static struct protosw streamproto = { .pr_type = SOCK_STREAM, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS| PR_CAPATTACH, .pr_ctloutput = &uipc_ctloutput, .pr_abort = uipc_abort, .pr_accept = uipc_accept, .pr_attach = uipc_attach, .pr_bind = uipc_bind, .pr_bindat = uipc_bindat, .pr_connect = uipc_connect, .pr_connectat = uipc_connectat, .pr_connect2 = uipc_connect2, .pr_detach = uipc_detach, .pr_disconnect = uipc_disconnect, .pr_listen = uipc_listen, .pr_peeraddr = uipc_peeraddr, .pr_rcvd = uipc_rcvd, .pr_send = uipc_send, .pr_ready = uipc_ready, .pr_sense = uipc_sense, .pr_shutdown = uipc_shutdown, .pr_sockaddr = uipc_sockaddr, .pr_soreceive = soreceive_generic, .pr_close = uipc_close, }; static struct protosw dgramproto = { .pr_type = SOCK_DGRAM, .pr_flags = PR_ATOMIC | PR_ADDR |PR_RIGHTS | PR_CAPATTACH | PR_SOCKBUF, .pr_ctloutput = &uipc_ctloutput, .pr_abort = uipc_abort, .pr_accept = uipc_accept, .pr_attach = uipc_attach, .pr_bind = uipc_bind, .pr_bindat = uipc_bindat, .pr_connect = uipc_connect, .pr_connectat = uipc_connectat, .pr_connect2 = uipc_connect2, .pr_detach = uipc_detach, .pr_disconnect = uipc_disconnect, .pr_peeraddr = uipc_peeraddr, .pr_sosend = uipc_sosend_dgram, .pr_sense = uipc_sense, .pr_shutdown = uipc_shutdown, .pr_sockaddr = uipc_sockaddr, .pr_soreceive = uipc_soreceive_dgram, .pr_close = uipc_close, }; static struct protosw seqpacketproto = { .pr_type = SOCK_SEQPACKET, /* * XXXRW: For now, PR_ADDR because soreceive will bump into them * due to our use of sbappendaddr. A new sbappend variants is needed * that supports both atomic record writes and control data. */ .pr_flags = PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED| PR_WANTRCVD|PR_RIGHTS|PR_CAPATTACH, .pr_ctloutput = &uipc_ctloutput, .pr_abort = uipc_abort, .pr_accept = uipc_accept, .pr_attach = uipc_attach, .pr_bind = uipc_bind, .pr_bindat = uipc_bindat, .pr_connect = uipc_connect, .pr_connectat = uipc_connectat, .pr_connect2 = uipc_connect2, .pr_detach = uipc_detach, .pr_disconnect = uipc_disconnect, .pr_listen = uipc_listen, .pr_peeraddr = uipc_peeraddr, .pr_rcvd = uipc_rcvd, .pr_send = uipc_send, .pr_sense = uipc_sense, .pr_shutdown = uipc_shutdown, .pr_sockaddr = uipc_sockaddr, .pr_soreceive = soreceive_generic, /* XXX: or...? */ .pr_close = uipc_close, }; static struct domain localdomain = { .dom_family = AF_LOCAL, .dom_name = "local", .dom_externalize = unp_externalize, .dom_dispose = unp_dispose, .dom_nprotosw = 3, .dom_protosw = { &streamproto, &dgramproto, &seqpacketproto, } }; DOMAIN_SET(local); /* * A helper function called by VFS before socket-type vnode reclamation. * For an active vnode it clears unp_vnode pointer and decrements unp_vnode * use count. */ void vfs_unp_reclaim(struct vnode *vp) { struct unpcb *unp; int active; struct mtx *vplock; ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim"); KASSERT(vp->v_type == VSOCK, ("vfs_unp_reclaim: vp->v_type != VSOCK")); active = 0; vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); VOP_UNP_CONNECT(vp, &unp); if (unp == NULL) goto done; UNP_PCB_LOCK(unp); if (unp->unp_vnode == vp) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; active = 1; } UNP_PCB_UNLOCK(unp); done: mtx_unlock(vplock); if (active) vunref(vp); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_unpflags(int unp_flags) { int comma; comma = 0; if (unp_flags & UNP_HAVEPC) { db_printf("%sUNP_HAVEPC", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_WANTCRED_ALWAYS) { db_printf("%sUNP_WANTCRED_ALWAYS", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_WANTCRED_ONESHOT) { db_printf("%sUNP_WANTCRED_ONESHOT", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNWAIT) { db_printf("%sUNP_CONNWAIT", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNECTING) { db_printf("%sUNP_CONNECTING", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_BINDING) { db_printf("%sUNP_BINDING", comma ? ", " : ""); comma = 1; } } static void db_print_xucred(int indent, struct xucred *xu) { int comma, i; db_print_indent(indent); db_printf("cr_version: %u cr_uid: %u cr_pid: %d cr_ngroups: %d\n", xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups); db_print_indent(indent); db_printf("cr_groups: "); comma = 0; for (i = 0; i < xu->cr_ngroups; i++) { db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]); comma = 1; } db_printf("\n"); } static void db_print_unprefs(int indent, struct unp_head *uh) { struct unpcb *unp; int counter; counter = 0; LIST_FOREACH(unp, uh, unp_reflink) { if (counter % 4 == 0) db_print_indent(indent); db_printf("%p ", unp); if (counter % 4 == 3) db_printf("\n"); counter++; } if (counter != 0 && counter % 4 != 0) db_printf("\n"); } DB_SHOW_COMMAND(unpcb, db_show_unpcb) { struct unpcb *unp; if (!have_addr) { db_printf("usage: show unpcb \n"); return; } unp = (struct unpcb *)addr; db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket, unp->unp_vnode); db_printf("unp_ino: %ju unp_conn: %p\n", (uintmax_t)unp->unp_ino, unp->unp_conn); db_printf("unp_refs:\n"); db_print_unprefs(2, &unp->unp_refs); /* XXXRW: Would be nice to print the full address, if any. */ db_printf("unp_addr: %p\n", unp->unp_addr); db_printf("unp_gencnt: %llu\n", (unsigned long long)unp->unp_gencnt); db_printf("unp_flags: %x (", unp->unp_flags); db_print_unpflags(unp->unp_flags); db_printf(")\n"); db_printf("unp_peercred:\n"); db_print_xucred(2, &unp->unp_peercred); db_printf("unp_refcount: %u\n", unp->unp_refcount); } #endif diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 9e25d6f8fefb..5f72506549bd 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -1,6135 +1,6117 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef INVARIANTS #include #endif #include #include #include #ifdef DDB #include #endif #include /* * High level overview of name caching in the VFS layer. * * Originally caching was implemented as part of UFS, later extracted to allow * use by other filesystems. A decision was made to make it optional and * completely detached from the rest of the kernel, which comes with limitations * outlined near the end of this comment block. * * This fundamental choice needs to be revisited. In the meantime, the current * state is described below. Significance of all notable routines is explained * in comments placed above their implementation. Scattered thoroughout the * file are TODO comments indicating shortcomings which can be fixed without * reworking everything (most of the fixes will likely be reusable). Various * details are omitted from this explanation to not clutter the overview, they * have to be checked by reading the code and associated commentary. * * Keep in mind that it's individual path components which are cached, not full * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries, * one for each name. * * I. Data organization * * Entries are described by "struct namecache" objects and stored in a hash * table. See cache_get_hash for more information. * * "struct vnode" contains pointers to source entries (names which can be found * when traversing through said vnode), destination entries (names of that * vnode (see "Limitations" for a breakdown on the subject) and a pointer to * the parent vnode. * * The (directory vnode; name) tuple reliably determines the target entry if * it exists. * * Since there are no small locks at this time (all are 32 bytes in size on * LP64), the code works around the problem by introducing lock arrays to * protect hash buckets and vnode lists. * * II. Filesystem integration * * Filesystems participating in name caching do the following: * - set vop_lookup routine to vfs_cache_lookup * - set vop_cachedlookup to whatever can perform the lookup if the above fails * - if they support lockless lookup (see below), vop_fplookup_vexec and * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the * mount point * - call cache_purge or cache_vop_* routines to eliminate stale entries as * applicable * - call cache_enter to add entries depending on the MAKEENTRY flag * * With the above in mind, there are 2 entry points when doing lookups: * - ... -> namei -> cache_fplookup -- this is the default * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei * should the above fail * * Example code flow how an entry is added: * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP -> * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter * * III. Performance considerations * * For lockless case forward lookup avoids any writes to shared areas apart * from the terminal path component. In other words non-modifying lookups of * different files don't suffer any scalability problems in the namecache. * Looking up the same file is limited by VFS and goes beyond the scope of this * file. * * At least on amd64 the single-threaded bottleneck for long paths is hashing * (see cache_get_hash). There are cases where the code issues acquire fence * multiple times, they can be combined on architectures which suffer from it. * * For locked case each encountered vnode has to be referenced and locked in * order to be handed out to the caller (normally that's namei). This * introduces significant hit single-threaded and serialization multi-threaded. * * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached -- * avoids any writes to shared areas to any components. * * Unrelated insertions are partially serialized on updating the global entry * counter and possibly serialized on colliding bucket or vnode locks. * * IV. Observability * * Note not everything has an explicit dtrace probe nor it should have, thus * some of the one-liners below depend on implementation details. * * Examples: * * # Check what lookups failed to be handled in a lockless manner. Column 1 is * # line number, column 2 is status code (see cache_fpl_status) * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }' * * # Lengths of names added by binary name * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }' * * # Same as above but only those which exceed 64 characters * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }' * * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what * # path is it * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }' * * V. Limitations and implementation defects * * - since it is possible there is no entry for an open file, tools like * "procstat" may fail to resolve fd -> vnode -> path to anything * - even if a filesystem adds an entry, it may get purged (e.g., due to memory * shortage) in which case the above problem applies * - hardlinks are not tracked, thus if a vnode is reachable in more than one * way, resolving a name may return a different path than the one used to * open it (even if said path is still valid) * - by default entries are not added for newly created files * - adding an entry may need to evict negative entry first, which happens in 2 * distinct places (evicting on lookup, adding in a later VOP) making it * impossible to simply reuse it * - there is a simple scheme to evict negative entries as the cache is approaching * its capacity, but it is very unclear if doing so is a good idea to begin with * - vnodes are subject to being recycled even if target inode is left in memory, * which loses the name cache entries when it perhaps should not. in case of tmpfs * names get duplicated -- kept by filesystem itself and namecache separately * - struct namecache has a fixed size and comes in 2 variants, often wasting space. * now hard to replace with malloc due to dependence on SMR. * - lack of better integration with the kernel also turns nullfs into a layered * filesystem instead of something which can take advantage of caching */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache"); SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", "const char *"); SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", "struct namecache *", "int", "int"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t"); SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); SDT_PROBE_DECLARE(vfs, namei, lookup, entry); SDT_PROBE_DECLARE(vfs, namei, lookup, return); static char __read_frequently cache_fast_lookup_enabled = true; /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct negstate { u_char neg_flag; u_char neg_hit; }; _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), "the state must fit in a union with a pointer without growing it"); struct namecache { LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ struct vnode *nc_dvp; /* vnode of parent of name */ union { struct vnode *nu_vp; /* vnode the name refers to */ struct negstate nu_neg;/* negative entry state */ } n_un; u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. * * See below for alignment requirement. */ struct namecache_ts { struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ int nc_pad; struct namecache nc_nc; }; TAILQ_HEAD(cache_freebatch, namecache); /* * At least mips n32 performs 64-bit accesses to timespec as found * in namecache_ts and requires them to be aligned. Since others * may be in the same spot suffer a little bit and enforce the * alignment for everyone. Note this is a nop for 64-bit platforms. */ #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) /* * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the * 4.4 BSD codebase. Later on struct namecache was tweaked to become * smaller and the value was bumped to retain the total size, but it * was never re-evaluated for suitability. A simple test counting * lengths during package building shows that the value of 45 covers * about 86% of all added entries, reaching 99% at 65. * * Regardless of the above, use of dedicated zones instead of malloc may be * inducing additional waste. This may be hard to address as said zones are * tied to VFS SMR. Even if retaining them, the current split should be * re-evaluated. */ #ifdef __LP64__ #define CACHE_PATH_CUTOFF 45 #define CACHE_LARGE_PAD 6 #else #define CACHE_PATH_CUTOFF 41 #define CACHE_LARGE_PAD 2 #endif #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); #define nc_vp n_un.nu_vp #define nc_neg n_un.nu_neg /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 #define NCF_DVDROP 0x10 #define NCF_NEGATIVE 0x20 #define NCF_INVALID 0x40 #define NCF_WIP 0x80 /* * Flags in negstate.neg_flag */ #define NEG_HOT 0x01 static bool cache_neg_evict_cond(u_long lnumcache); /* * Mark an entry as invalid. * * This is called before it starts getting deconstructed. */ static void cache_ncp_invalidate(struct namecache *ncp) { KASSERT((ncp->nc_flag & NCF_INVALID) == 0, ("%s: entry %p already invalid", __func__, ncp)); atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); atomic_thread_fence_rel(); } /* * Check whether the entry can be safely used. * * All places which elide locks are supposed to call this after they are * done with reading from an entry. */ #define cache_ncp_canuse(ncp) ({ \ struct namecache *_ncp = (ncp); \ u_char _nc_flag; \ \ atomic_thread_fence_acq(); \ _nc_flag = atomic_load_char(&_ncp->nc_flag); \ __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ }) /* * Like the above but also checks NCF_WHITE. */ #define cache_fpl_neg_ncp_canuse(ncp) ({ \ struct namecache *_ncp = (ncp); \ u_char _nc_flag; \ \ atomic_thread_fence_acq(); \ _nc_flag = atomic_load_char(&_ncp->nc_flag); \ __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \ }) VFS_SMR_DECLARE; static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache parameters"); static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0, "Total namecache capacity"); u_int ncsizefactor = 2; SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); /* * Negative entry % of namecache capacity above which automatic eviction is allowed. * * Check cache_neg_evict_cond for details. */ static u_int ncnegminpct = 3; static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, "Negative entry count above which automatic eviction is allowed"); /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ static u_long __read_mostly nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ struct nchstats nchstats; /* cache effectiveness statistics */ static u_int __exclusive_cache_line neg_cycle; #define ncneghash 3 #define numneglists (ncneghash + 1) struct neglist { struct mtx nl_evict_lock; struct mtx nl_lock __aligned(CACHE_LINE_SIZE); TAILQ_HEAD(, namecache) nl_list; TAILQ_HEAD(, namecache) nl_hotlist; u_long nl_hotnum; } __aligned(CACHE_LINE_SIZE); static struct neglist neglists[numneglists]; static inline struct neglist * NCP2NEGLIST(struct namecache *ncp) { return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); } static inline struct negstate * NCP2NEGSTATE(struct namecache *ncp) { MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE); return (&ncp->nc_neg); } #define numbucketlocks (ncbuckethash + 1) static u_int __read_mostly ncbuckethash; static struct mtx_padalign __read_mostly *bucketlocks; #define HASH2BUCKETLOCK(hash) \ ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) #define numvnodelocks (ncvnodehash + 1) static u_int __read_mostly ncvnodehash; static struct mtx __read_mostly *vnodelocks; static inline struct mtx * VP2VNODELOCK(struct vnode *vp) { return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp == NULL) return; ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_time; *ticksp = ncp_ts->nc_ticks; } #ifdef DEBUG_CACHE static int __read_mostly doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); #endif /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct namecache), "sizeof(struct namecache)"); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache statistics"); #define STATNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); #define STATNODE_COUNTER(name, varname, descr) \ static COUNTER_U64_DEFINE_EARLY(varname); \ SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ descr); STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); STATNODE_ULONG(count, numcache, "Number of cache entries"); STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); STATNODE_COUNTER(dothits, dothits, "Number of '.' hits"); STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits"); STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); STATNODE_COUNTER(posszaps, numposzaps, "Number of cache hits (positive) we do not want to cache"); STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); STATNODE_COUNTER(negzaps, numnegzaps, "Number of cache hits (negative) we do not want to cache"); STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); /* These count for vn_getcwd(), too. */ STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); STATNODE_COUNTER(fullpathfail2, numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache"); /* * Debug or developer statistics. */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache debugging"); #define DEBUGNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); #define DEBUGNODE_COUNTER(name, varname, descr) \ static COUNTER_U64_DEFINE_EARLY(varname); \ SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ descr); DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, "Number of successful removals after relocking"); static long zap_bucket_fail; DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); static long zap_bucket_fail2; DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); static long cache_lock_vnodes_cel_3_failures; DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, "Number of times 3-way vnode locking failed"); static void cache_zap_locked(struct namecache *ncp); static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend); static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen); static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); static inline void cache_assert_vlp_locked(struct mtx *vlp) { if (vlp != NULL) mtx_assert(vlp, MA_OWNED); } static inline void cache_assert_vnode_locked(struct vnode *vp) { struct mtx *vlp; vlp = VP2VNODELOCK(vp); cache_assert_vlp_locked(vlp); } /* * Directory vnodes with entries are held for two reasons: * 1. make them less of a target for reclamation in vnlru * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided * * It will be feasible to stop doing it altogether if all filesystems start * supporting lockless lookup. */ static void cache_hold_vnode(struct vnode *vp) { cache_assert_vnode_locked(vp); VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); vhold(vp); counter_u64_add(numcachehv, 1); } static void cache_drop_vnode(struct vnode *vp) { /* * Called after all locks are dropped, meaning we can't assert * on the state of v_cache_src. */ vdrop(vp); counter_u64_add(numcachehv, -1); } /* * UMA zones. */ static uma_zone_t __read_mostly cache_zone_small; static uma_zone_t __read_mostly cache_zone_small_ts; static uma_zone_t __read_mostly cache_zone_large; static uma_zone_t __read_mostly cache_zone_large_ts; char * cache_symlink_alloc(size_t size, int flags) { if (size < CACHE_ZONE_SMALL_SIZE) { return (uma_zalloc_smr(cache_zone_small, flags)); } if (size < CACHE_ZONE_LARGE_SIZE) { return (uma_zalloc_smr(cache_zone_large, flags)); } counter_u64_add(symlinktoobig, 1); SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size); return (NULL); } void cache_symlink_free(char *string, size_t size) { MPASS(string != NULL); KASSERT(size < CACHE_ZONE_LARGE_SIZE, ("%s: size %zu too big", __func__, size)); if (size < CACHE_ZONE_SMALL_SIZE) { uma_zfree_smr(cache_zone_small, string); return; } if (size < CACHE_ZONE_LARGE_SIZE) { uma_zfree_smr(cache_zone_large, string); return; } __assert_unreachable(); } static struct namecache * cache_alloc_uma(int len, bool ts) { struct namecache_ts *ncp_ts; struct namecache *ncp; if (__predict_false(ts)) { if (len <= CACHE_PATH_CUTOFF) ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); else ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); ncp = &ncp_ts->nc_nc; } else { if (len <= CACHE_PATH_CUTOFF) ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); else ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); } return (ncp); } static void cache_free_uma(struct namecache *ncp) { struct namecache_ts *ncp_ts; if (__predict_false(ncp->nc_flag & NCF_TS)) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small_ts, ncp_ts); else uma_zfree_smr(cache_zone_large_ts, ncp_ts); } else { if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small, ncp); else uma_zfree_smr(cache_zone_large, ncp); } } static struct namecache * cache_alloc(int len, bool ts) { u_long lnumcache; /* * Avoid blowout in namecache entries. * * Bugs: * 1. filesystems may end up trying to add an already existing entry * (for example this can happen after a cache miss during concurrent * lookup), in which case we will call cache_neg_evict despite not * adding anything. * 2. the routine may fail to free anything and no provisions are made * to make it try harder (see the inside for failure modes) * 3. it only ever looks at negative entries. */ lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; if (cache_neg_evict_cond(lnumcache)) { lnumcache = atomic_load_long(&numcache); } if (__predict_false(lnumcache >= ncsize)) { atomic_subtract_long(&numcache, 1); counter_u64_add(numdrops, 1); return (NULL); } return (cache_alloc_uma(len, ts)); } static void cache_free(struct namecache *ncp) { MPASS(ncp != NULL); if ((ncp->nc_flag & NCF_DVDROP) != 0) { cache_drop_vnode(ncp->nc_dvp); } cache_free_uma(ncp); atomic_subtract_long(&numcache, 1); } static void cache_free_batch(struct cache_freebatch *batch) { struct namecache *ncp, *nnp; int i; i = 0; if (TAILQ_EMPTY(batch)) goto out; TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { if ((ncp->nc_flag & NCF_DVDROP) != 0) { cache_drop_vnode(ncp->nc_dvp); } cache_free_uma(ncp); i++; } atomic_subtract_long(&numcache, i); out: SDT_PROBE1(vfs, namecache, purge, batch, i); } /* * Hashing. * * The code was made to use FNV in 2001 and this choice needs to be revisited. * * Short summary of the difficulty: * The longest name which can be inserted is NAME_MAX characters in length (or * 255 at the time of writing this comment), while majority of names used in * practice are significantly shorter (mostly below 10). More importantly * majority of lookups performed find names are even shorter than that. * * This poses a problem where hashes which do better than FNV past word size * (or so) tend to come with additional overhead when finalizing the result, * making them noticeably slower for the most commonly used range. * * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c * * When looking it up the most time consuming part by a large margin (at least * on amd64) is hashing. Replacing FNV with something which pessimizes short * input would make the slowest part stand out even more. */ /* * TODO: With the value stored we can do better than computing the hash based * on the address. */ static void cache_prehash(struct vnode *vp) { vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); } static uint32_t cache_get_hash(char *name, u_char len, struct vnode *dvp) { return (fnv_32_buf(name, len, dvp->v_nchash)); } static uint32_t cache_get_hash_iter_start(struct vnode *dvp) { return (dvp->v_nchash); } static uint32_t cache_get_hash_iter(char c, uint32_t hash) { return (fnv_32_buf(&c, 1, hash)); } static uint32_t cache_get_hash_iter_finish(uint32_t hash) { return (hash); } static inline struct nchashhead * NCP2BUCKET(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (NCHHASH(hash)); } static inline struct mtx * NCP2BUCKETLOCK(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (HASH2BUCKETLOCK(hash)); } #ifdef INVARIANTS static void cache_assert_bucket_locked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_OWNED); } static void cache_assert_bucket_unlocked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_NOTOWNED); } #else #define cache_assert_bucket_locked(x) do { } while (0) #define cache_assert_bucket_unlocked(x) do { } while (0) #endif #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) static void _cache_sort_vnodes(void **p1, void **p2) { void *tmp; MPASS(*p1 != NULL || *p2 != NULL); if (*p1 > *p2) { tmp = *p2; *p2 = *p1; *p1 = tmp; } } static void cache_lock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_lock(&bucketlocks[i]); } static void cache_unlock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_unlock(&bucketlocks[i]); } static void cache_lock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_lock(&vnodelocks[i]); } static void cache_unlock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_unlock(&vnodelocks[i]); } static int cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { if (!mtx_trylock(vlp1)) return (EAGAIN); } if (!mtx_trylock(vlp2)) { if (vlp1 != NULL) mtx_unlock(vlp1); return (EAGAIN); } return (0); } static void cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); MPASS(vlp1 <= vlp2); if (vlp1 != NULL) mtx_lock(vlp1); if (vlp2 != NULL) mtx_lock(vlp2); } static void cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); } static int sysctl_nchstats(SYSCTL_HANDLER_ARGS) { struct nchstats snap; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, sizeof(snap))); snap = nchstats; snap.ncs_goodhits = counter_u64_fetch(numposhits); snap.ncs_neghits = counter_u64_fetch(numneghits); snap.ncs_badhits = counter_u64_fetch(numposzaps) + counter_u64_fetch(numnegzaps); snap.ncs_miss = counter_u64_fetch(nummisszap) + counter_u64_fetch(nummiss); return (SYSCTL_OUT(req, &snap, sizeof(snap))); } SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", "VFS cache effectiveness statistics"); static void cache_recalc_neg_min(u_int val) { neg_min = (ncsize * val) / 100; } static int sysctl_negminpct(SYSCTL_HANDLER_ARGS) { u_int val; int error; val = ncnegminpct; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == ncnegminpct) return (0); if (val < 0 || val > 99) return (EINVAL); ncnegminpct = val; cache_recalc_neg_min(val); return (0); } SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); #ifdef DEBUG_CACHE /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); cache_lock_all_buckets(); if (n_nchash != nchash + 1) { cache_unlock_all_buckets(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) CK_SLIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); cache_lock_all_buckets(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; cache_unlock_all_buckets(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* * Negative entries management * * Various workloads create plenty of negative entries and barely use them * afterwards. Moreover malicious users can keep performing bogus lookups * adding even more entries. For example "make tinderbox" as of writing this * comment ends up with 2.6M namecache entries in total, 1.2M of which are * negative. * * As such, a rather aggressive eviction method is needed. The currently * employed method is a placeholder. * * Entries are split over numneglists separate lists, each of which is further * split into hot and cold entries. Entries get promoted after getting a hit. * Eviction happens on addition of new entry. */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache negative entry statistics"); SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, "Number of negative cache entries"); static COUNTER_U64_DEFINE_EARLY(neg_created); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, "Number of created negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evicted); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, "Number of evicted negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, &neg_evict_skipped_empty, "Number of times evicting failed due to lack of entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, &neg_evict_skipped_missed, "Number of times evicting failed due to target entry disappearing"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, &neg_evict_skipped_contended, "Number of times evicting failed due to contention"); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, "Number of cache hits (negative)"); static int sysctl_neg_hot(SYSCTL_HANDLER_ARGS) { int i, out; out = 0; for (i = 0; i < numneglists; i++) out += neglists[i].nl_hotnum; return (SYSCTL_OUT(req, &out, sizeof(out))); } SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", "Number of hot negative entries"); static void cache_neg_init(struct namecache *ncp) { struct negstate *ns; ncp->nc_flag |= NCF_NEGATIVE; ns = NCP2NEGSTATE(ncp); ns->neg_flag = 0; ns->neg_hit = 0; counter_u64_add(neg_created, 1); } #define CACHE_NEG_PROMOTION_THRESH 2 static bool cache_neg_hit_prep(struct namecache *ncp) { struct negstate *ns; u_char n; ns = NCP2NEGSTATE(ncp); n = atomic_load_char(&ns->neg_hit); for (;;) { if (n >= CACHE_NEG_PROMOTION_THRESH) return (false); if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) break; } return (n + 1 == CACHE_NEG_PROMOTION_THRESH); } /* * Nothing to do here but it is provided for completeness as some * cache_neg_hit_prep callers may end up returning without even * trying to promote. */ #define cache_neg_hit_abort(ncp) do { } while (0) static void cache_neg_hit_finish(struct namecache *ncp) { SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); counter_u64_add(numneghits, 1); } /* * Move a negative entry to the hot list. */ static void cache_neg_promote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); if ((ns->neg_flag & NEG_HOT) == 0) { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum++; ns->neg_flag |= NEG_HOT; } } /* * Move a hot negative entry to the cold list. */ static void cache_neg_demote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); MPASS(ns->neg_flag & NEG_HOT); TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); nl->nl_hotnum--; ns->neg_flag &= ~NEG_HOT; atomic_store_char(&ns->neg_hit, 0); } /* * Move a negative entry to the hot list if it matches the lookup. * * We have to take locks, but they may be contended and in the worst * case we may need to go off CPU. We don't want to spin within the * smr section and we can't block with it. Exiting the section means * the found entry could have been evicted. We are going to look it * up again. */ static bool cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, struct namecache *oncp, uint32_t hash) { struct namecache *ncp; struct neglist *nl; u_char nc_flag; nl = NCP2NEGLIST(oncp); mtx_lock(&nl->nl_lock); /* * For hash iteration. */ vfs_smr_enter(); /* * Avoid all surprises by only succeeding if we got the same entry and * bailing completely otherwise. * XXX There are no provisions to keep the vnode around, meaning we may * end up promoting a negative entry for a *new* vnode and returning * ENOENT on its account. This is the error we want to return anyway * and promotion is harmless. * * In particular at this point there can be a new ncp which matches the * search but hashes to a different neglist. */ CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp == oncp) break; } /* * No match to begin with. */ if (__predict_false(ncp == NULL)) { goto out_abort; } /* * The newly found entry may be something different... */ if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { goto out_abort; } /* * ... and not even negative. */ nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) == 0) { goto out_abort; } if (!cache_ncp_canuse(ncp)) { goto out_abort; } cache_neg_promote_locked(ncp); cache_neg_hit_finish(ncp); vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (true); out_abort: vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (false); } static void cache_neg_promote(struct namecache *ncp) { struct neglist *nl; nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); cache_neg_promote_locked(ncp); mtx_unlock(&nl->nl_lock); } static void cache_neg_insert(struct namecache *ncp) { struct neglist *nl; MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); mtx_unlock(&nl->nl_lock); atomic_add_long(&numneg, 1); } static void cache_neg_remove(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); ns = NCP2NEGSTATE(ncp); mtx_lock(&nl->nl_lock); if ((ns->neg_flag & NEG_HOT) != 0) { TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum--; } else { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); } mtx_unlock(&nl->nl_lock); atomic_subtract_long(&numneg, 1); } static struct neglist * cache_neg_evict_select_list(void) { struct neglist *nl; u_int c; c = atomic_fetchadd_int(&neg_cycle, 1) + 1; nl = &neglists[c % numneglists]; if (!mtx_trylock(&nl->nl_evict_lock)) { counter_u64_add(neg_evict_skipped_contended, 1); return (NULL); } return (nl); } static struct namecache * cache_neg_evict_select_entry(struct neglist *nl) { struct namecache *ncp, *lncp; struct negstate *ns, *lns; int i; mtx_assert(&nl->nl_evict_lock, MA_OWNED); mtx_assert(&nl->nl_lock, MA_OWNED); ncp = TAILQ_FIRST(&nl->nl_list); if (ncp == NULL) return (NULL); lncp = ncp; lns = NCP2NEGSTATE(lncp); for (i = 1; i < 4; i++) { ncp = TAILQ_NEXT(ncp, nc_dst); if (ncp == NULL) break; ns = NCP2NEGSTATE(ncp); if (ns->neg_hit < lns->neg_hit) { lncp = ncp; lns = ns; } } return (lncp); } static bool cache_neg_evict(void) { struct namecache *ncp, *ncp2; struct neglist *nl; struct vnode *dvp; struct mtx *dvlp; struct mtx *blp; uint32_t hash; u_char nlen; bool evicted; nl = cache_neg_evict_select_list(); if (nl == NULL) { return (false); } mtx_lock(&nl->nl_lock); ncp = TAILQ_FIRST(&nl->nl_hotlist); if (ncp != NULL) { cache_neg_demote_locked(ncp); } ncp = cache_neg_evict_select_entry(nl); if (ncp == NULL) { counter_u64_add(neg_evict_skipped_empty, 1); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); return (false); } nlen = ncp->nc_nlen; dvp = ncp->nc_dvp; hash = cache_get_hash(ncp->nc_name, nlen, dvp); dvlp = VP2VNODELOCK(dvp); blp = HASH2BUCKETLOCK(hash); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); mtx_lock(dvlp); mtx_lock(blp); /* * Note that since all locks were dropped above, the entry may be * gone or reallocated to be something else. */ CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { if (ncp2 == ncp && ncp2->nc_dvp == dvp && ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) break; } if (ncp2 == NULL) { counter_u64_add(neg_evict_skipped_missed, 1); ncp = NULL; evicted = false; } else { MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); MPASS(blp == NCP2BUCKETLOCK(ncp)); SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, ncp->nc_name); cache_zap_locked(ncp); counter_u64_add(neg_evicted, 1); evicted = true; } mtx_unlock(blp); mtx_unlock(dvlp); if (ncp != NULL) cache_free(ncp); return (evicted); } /* * Maybe evict a negative entry to create more room. * * The ncnegfactor parameter limits what fraction of the total count * can comprise of negative entries. However, if the cache is just * warming up this leads to excessive evictions. As such, ncnegminpct * (recomputed to neg_min) dictates whether the above should be * applied. * * Try evicting if the cache is close to full capacity regardless of * other considerations. */ static bool cache_neg_evict_cond(u_long lnumcache) { u_long lnumneg; if (ncsize - 1000 < lnumcache) goto out_evict; lnumneg = atomic_load_long(&numneg); if (lnumneg < neg_min) return (false); if (lnumneg * ncnegfactor < lnumcache) return (false); out_evict: return (cache_neg_evict()); } /* * cache_zap_locked(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap_locked(struct namecache *ncp) { struct nchashhead *ncpp; struct vnode *dvp, *vp; dvp = ncp->nc_dvp; vp = ncp->nc_vp; if (!(ncp->nc_flag & NCF_NEGATIVE)) cache_assert_vnode_locked(vp); cache_assert_vnode_locked(dvp); cache_assert_bucket_locked(ncp); cache_ncp_invalidate(ncp); ncpp = NCP2BUCKET(ncp); CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); if (!(ncp->nc_flag & NCF_NEGATIVE)) { SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp); TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst); if (ncp == vp->v_cache_dd) { atomic_store_ptr(&vp->v_cache_dd, NULL); } } else { SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name); cache_neg_remove(ncp); } if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == dvp->v_cache_dd) { atomic_store_ptr(&dvp->v_cache_dd, NULL); } } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&dvp->v_cache_src)) { ncp->nc_flag |= NCF_DVDROP; } } } static void cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) { struct mtx *blp; MPASS(ncp->nc_dvp == vp); MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_vnode_locked(vp); blp = NCP2BUCKETLOCK(ncp); mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); } static bool cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, struct mtx **vlpp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct mtx *blp; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); if (ncp->nc_flag & NCF_NEGATIVE) { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_zap_negative_locked_vnode_kl(ncp, vp); return (true); } pvlp = VP2VNODELOCK(vp); blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); if (*vlpp == vlp1 || *vlpp == vlp2) { to_unlock = *vlpp; *vlpp = NULL; } else { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) goto out_relock; to_unlock = vlp1; } } mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); if (to_unlock != NULL) mtx_unlock(to_unlock); return (true); out_relock: mtx_unlock(vlp2); mtx_lock(vlp1); mtx_lock(vlp2); MPASS(*vlpp == NULL); *vlpp = vlp1; return (false); } /* * If trylocking failed we can get here. We know enough to take all needed locks * in the right order and re-lookup the entry. */ static int cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, struct mtx *blp) { struct namecache *rncp; cache_assert_bucket_unlocked(ncp); cache_sort_vnodes(&dvlp, &vlp); cache_lock_vnodes(dvlp, vlp); mtx_lock(blp); CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { if (rncp == ncp && rncp->nc_dvp == dvp && rncp->nc_nlen == cnp->cn_namelen && !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) break; } if (rncp != NULL) { cache_zap_locked(rncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); counter_u64_add(zap_bucket_relock_success, 1); return (0); } mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (EAGAIN); } static int __noinline cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, uint32_t hash, struct mtx *blp) { struct mtx *dvlp, *vlp; struct vnode *dvp; cache_assert_bucket_locked(ncp); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = NULL; if (!(ncp->nc_flag & NCF_NEGATIVE)) vlp = VP2VNODELOCK(ncp->nc_vp); if (cache_trylock_vnodes(dvlp, vlp) == 0) { cache_zap_locked(ncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (0); } dvp = ncp->nc_dvp; mtx_unlock(blp); return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); } static __noinline int cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) { struct namecache *ncp; struct mtx *blp; struct mtx *dvlp, *dvlp2; uint32_t hash; int error; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { dvlp = VP2VNODELOCK(dvp); dvlp2 = NULL; mtx_lock(dvlp); retry_dotdot: ncp = dvp->v_cache_dd; if (ncp == NULL) { mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) goto retry_dotdot; MPASS(dvp->v_cache_dd == NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); cache_free(ncp); } else { atomic_store_ptr(&dvp->v_cache_dd, NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); } SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); return (1); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); retry: if (CK_SLIST_EMPTY(NCHHASH(hash))) goto out_no_entry; mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (ncp == NULL) { mtx_unlock(blp); goto out_no_entry; } error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { zap_bucket_fail++; goto retry; } counter_u64_add(numposzaps, 1); SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); cache_free(ncp); return (1); out_no_entry: counter_u64_add(nummisszap, 1); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } static int __noinline cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { int ltype; *vpp = dvp; counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; vrefact(*vpp); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED((*vpp))) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } static int __noinline cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; struct namecache *ncp; struct mtx *dvlp; enum vgetstate vs; int error, ltype; bool whiteout; MPASS((cnp->cn_flags & ISDOTDOT) != 0); if ((cnp->cn_flags & MAKEENTRY) == 0) { cache_remove_cnp(dvp, cnp); return (0); } counter_u64_add(dotdothits, 1); retry: dvlp = VP2VNODELOCK(dvp); mtx_lock(dvlp); ncp = dvp->v_cache_dd; if (ncp == NULL) { SDT_PROBE2(vfs, namecache, lookup, miss, dvp, ".."); mtx_unlock(dvlp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (ncp->nc_flag & NCF_NEGATIVE) *vpp = NULL; else *vpp = ncp->nc_vp; } else *vpp = ncp->nc_dvp; if (*vpp == NULL) goto negative_success; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_dotdottime; } MPASS(dvp != *vpp); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); vs = vget_prep(*vpp); mtx_unlock(dvlp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); vn_lock(dvp, ltype | LK_RETRY); if (VN_IS_DOOMED(dvp)) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: if (__predict_false(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); cache_zap_negative_locked_vnode_kl(ncp, dvp); mtx_unlock(dvlp); cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(dvlp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } /** * Lookup a name in the name cache * * # Arguments * * - dvp: Parent directory in which to search. * - vpp: Return argument. Will contain desired vnode on cache hit. * - cnp: Parameters of the name search. The most interesting bits of * the cn_flags field have the following meanings: * - MAKEENTRY: If clear, free an entry from the cache rather than look * it up. * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." * - tsp: Return storage for cache timestamp. On a successful (positive * or negative) lookup, tsp will be filled with any timespec that * was stored when this cache entry was created. However, it will * be clear for "." entries. * - ticks: Return storage for alternate cache timestamp. On a successful * (positive or negative) lookup, it will contain the ticks value * that was current when the cache entry was created, unless cnp * was ".". * * Either both tsp and ticks have to be provided or neither of them. * * # Returns * * - -1: A positive cache hit. vpp will contain the desired vnode. * - ENOENT: A negative cache hit, or dvp was recycled out from under us due * to a forced unmount. vpp will not be modified. If the entry * is a whiteout, then the ISWHITEOUT flag will be set in * cnp->cn_flags. * - 0: A cache miss. vpp will not be modified. * * # Locking * * On a cache hit, vpp will be returned locked and ref'd. If we're looking up * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the * lock is not recursively acquired. */ static int __noinline cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; struct mtx *blp; uint32_t hash; enum vgetstate vs; int error; bool whiteout; MPASS((cnp->cn_flags & ISDOTDOT) == 0); MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); retry: hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { mtx_unlock(blp); SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); counter_u64_add(nummiss, 1); return (0); } if (ncp->nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); vs = vget_prep(*vpp); mtx_unlock(blp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: /* * We don't get here with regular lookup apart from corner cases. */ if (__predict_true(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { zap_bucket_fail2++; goto retry; } cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(blp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; uint32_t hash; enum vgetstate vs; int error; bool whiteout, neg_promote; u_short nc_flag; MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) { cnp->cn_flags &= ~MAKEENTRY; return (0); } #endif if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); } MPASS((cnp->cn_flags & ISDOTDOT) == 0); if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { cache_remove_cnp(dvp, cnp); return (0); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); vfs_smr_enter(); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { vfs_smr_exit(); SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); counter_u64_add(nummiss, 1); return (0); } nc_flag = atomic_load_char(&ncp->nc_flag); if (nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); if (!cache_ncp_canuse(ncp)) { vfs_smr_exit(); *vpp = NULL; goto out_fallback; } vs = vget_prep_smr(*vpp); vfs_smr_exit(); if (__predict_false(vs == VGET_NONE)) { *vpp = NULL; goto out_fallback; } error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto out_fallback; } return (-1); negative_success: if (cnp->cn_nameiop == CREATE) { if (cnp->cn_flags & ISLASTCN) { vfs_smr_exit(); goto out_fallback; } } cache_out_ts(ncp, tsp, ticksp); whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); neg_promote = cache_neg_hit_prep(ncp); if (!cache_ncp_canuse(ncp)) { cache_neg_hit_abort(ncp); vfs_smr_exit(); goto out_fallback; } if (neg_promote) { vfs_smr_exit(); if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) goto out_fallback; } else { cache_neg_hit_finish(ncp); vfs_smr_exit(); } if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); out_fallback: return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); } struct celockstate { struct mtx *vlp[3]; struct mtx *blp[2]; }; CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); static inline void cache_celockstate_init(struct celockstate *cel) { bzero(cel, sizeof(*cel)); } static void cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, struct vnode *dvp) { struct mtx *vlp1, *vlp2; MPASS(cel->vlp[0] == NULL); MPASS(cel->vlp[1] == NULL); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL || dvp != NULL); vlp1 = VP2VNODELOCK(vp); vlp2 = VP2VNODELOCK(dvp); cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { mtx_lock(vlp1); cel->vlp[0] = vlp1; } mtx_lock(vlp2); cel->vlp[1] = vlp2; } static void cache_unlock_vnodes_cel(struct celockstate *cel) { MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); if (cel->vlp[0] != NULL) mtx_unlock(cel->vlp[0]); if (cel->vlp[1] != NULL) mtx_unlock(cel->vlp[1]); if (cel->vlp[2] != NULL) mtx_unlock(cel->vlp[2]); } static bool cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) { struct mtx *vlp; bool ret; cache_assert_vlp_locked(cel->vlp[0]); cache_assert_vlp_locked(cel->vlp[1]); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL); vlp = VP2VNODELOCK(vp); ret = true; if (vlp >= cel->vlp[1]) { mtx_lock(vlp); } else { if (mtx_trylock(vlp)) goto out; cache_lock_vnodes_cel_3_failures++; cache_unlock_vnodes_cel(cel); if (vlp < cel->vlp[0]) { mtx_lock(vlp); mtx_lock(cel->vlp[0]); mtx_lock(cel->vlp[1]); } else { if (cel->vlp[0] != NULL) mtx_lock(cel->vlp[0]); mtx_lock(vlp); mtx_lock(cel->vlp[1]); } ret = false; } out: cel->vlp[2] = vlp; return (ret); } static void cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, struct mtx *blp2) { MPASS(cel->blp[0] == NULL); MPASS(cel->blp[1] == NULL); cache_sort_vnodes(&blp1, &blp2); if (blp1 != NULL) { mtx_lock(blp1); cel->blp[0] = blp1; } mtx_lock(blp2); cel->blp[1] = blp2; } static void cache_unlock_buckets_cel(struct celockstate *cel) { if (cel->blp[0] != NULL) mtx_unlock(cel->blp[0]); mtx_unlock(cel->blp[1]); } /* * Lock part of the cache affected by the insertion. * * This means vnodelocks for dvp, vp and the relevant bucketlock. * However, insertion can result in removal of an old entry. In this * case we have an additional vnode and bucketlock pair to lock. * * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while * preserving the locking order (smaller address first). */ static void cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; u_char nc_flag; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); if (vp == NULL || vp->v_type != VDIR) break; ncp = atomic_load_consume_ptr(&vp->v_cache_dd); if (ncp == NULL) break; nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == vp); blps[1] = NCP2BUCKETLOCK(ncp); if ((nc_flag & NCF_NEGATIVE) != 0) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; /* * All vnodes got re-locked. Re-validate the state and if * nothing changed we are done. Otherwise restart. */ if (ncp == vp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; u_char nc_flag; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); if (ncp == NULL) break; nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == dvp); blps[1] = NCP2BUCKETLOCK(ncp); if ((nc_flag & NCF_NEGATIVE) != 0) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; if (ncp == dvp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_unlock(struct celockstate *cel) { cache_unlock_buckets_cel(cel); cache_unlock_vnodes_cel(cel); } static void __noinline cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct celockstate cel; struct namecache *ncp; uint32_t hash; int len; if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) return; len = cnp->cn_namelen; cache_celockstate_init(&cel); hash = cache_get_hash(cnp->cn_nameptr, len, dvp); cache_enter_lock_dd(&cel, dvp, vp, hash); ncp = dvp->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); cache_zap_locked(ncp); } else { ncp = NULL; } atomic_store_ptr(&dvp->v_cache_dd, NULL); cache_enter_unlock(&cel); if (ncp != NULL) cache_free(ncp); } /* * Add an entry to the cache. */ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp) { struct celockstate cel; struct namecache *ncp, *n2, *ndd; struct namecache_ts *ncp_ts; struct nchashhead *ncpp; uint32_t hash; int flag; int len; KASSERT(cnp->cn_namelen <= NAME_MAX, ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, NAME_MAX)); VNPASS(!VN_IS_DOOMED(dvp), dvp); VNPASS(dvp->v_type != VNON, dvp); if (vp != NULL) { VNPASS(!VN_IS_DOOMED(vp), vp); VNPASS(vp->v_type != VNON, vp); } if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { KASSERT(dvp == vp, ("%s: different vnodes for dot entry (%p; %p)\n", __func__, dvp, vp)); } else { KASSERT(dvp != vp, ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__, cnp->cn_nameptr, dvp)); } #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) return; #endif flag = 0; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { cache_enter_dotdot_prep(dvp, vp, cnp); flag = NCF_ISDOTDOT; } } ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); if (ncp == NULL) return; cache_celockstate_init(&cel); ndd = NULL; ncp_ts = NULL; /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp->nc_flag = flag | NCF_WIP; ncp->nc_vp = vp; if (vp == NULL) cache_neg_init(ncp); ncp->nc_dvp = dvp; if (tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); ncp_ts->nc_time = *tsp; ncp_ts->nc_ticks = ticks; ncp_ts->nc_nc.nc_flag |= NCF_TS; if (dtsp != NULL) { ncp_ts->nc_dotdottime = *dtsp; ncp_ts->nc_nc.nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); memcpy(ncp->nc_name, cnp->cn_nameptr, len); ncp->nc_name[len] = '\0'; cache_enter_lock(&cel, dvp, vp, hash); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); CK_SLIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { MPASS(cache_ncp_canuse(n2)); if ((n2->nc_flag & NCF_NEGATIVE) != 0) KASSERT(vp == NULL, ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", __func__, NULL, vp, cnp->cn_nameptr)); else KASSERT(n2->nc_vp == vp, ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", __func__, n2->nc_vp, vp, cnp->cn_nameptr)); /* * Entries are supposed to be immutable unless in the * process of getting destroyed. Accommodating for * changing timestamps is possible but not worth it. * This should be harmless in terms of correctness, in * the worst case resulting in an earlier expiration. * Alternatively, the found entry can be replaced * altogether. */ MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); #if 0 if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n2_ts = __containerof(n2, struct namecache_ts, nc_nc); n2_ts->nc_time = ncp_ts->nc_time; n2_ts->nc_ticks = ncp_ts->nc_ticks; if (dtsp != NULL) { n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; n2_ts->nc_nc.nc_flag |= NCF_DTS; } } #endif SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, vp); goto out_unlock_free; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); atomic_thread_fence_rel(); atomic_store_ptr(&dvp->v_cache_dd, ncp); } if (vp != NULL) { if (flag != NCF_ISDOTDOT) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. */ if ((ndd = vp->v_cache_dd) != NULL) { if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) cache_zap_locked(ndd); else ndd = NULL; } atomic_thread_fence_rel(); atomic_store_ptr(&vp->v_cache_dd, ncp); } else if (vp->v_type != VDIR) { if (vp->v_cache_dd != NULL) { atomic_store_ptr(&vp->v_cache_dd, NULL); } } } if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { cache_hold_vnode(dvp); } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, vp); } else { if (cnp->cn_flags & ISWHITEOUT) atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); cache_neg_insert(ncp); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, ncp->nc_name); } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); atomic_thread_fence_rel(); /* * Mark the entry as fully constructed. * It is immutable past this point until its removal. */ atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); cache_enter_unlock(&cel); if (ndd != NULL) cache_free(ndd); return; out_unlock_free: cache_enter_unlock(&cel); cache_free(ncp); return; } /* * A variant of the above accepting flags. * * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it. * * TODO: this routine is a hack. It blindly removes the old entry, even if it * happens to match and it is doing it in an inefficient manner. It was added * to accommodate NFS which runs into a case where the target for a given name * may change from under it. Note this does nothing to solve the following * race: 2 callers of cache_enter_time_flags pass a different target vnode for * the same [dvp, cnp]. It may be argued that code doing this is broken. */ void cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp, int flags) { MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0); if (flags & VFS_CACHE_DROPOLD) cache_remove_cnp(dvp, cnp); cache_enter_time(dvp, vp, cnp, tsp, dtsp); } static u_int cache_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } static struct nchashhead * nchinittbl(u_long elements, u_long *hashmask) { struct nchashhead *hashtbl; u_long hashsize, i; hashsize = cache_roundup_2(elements) / 2; hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); for (i = 0; i < hashsize; i++) CK_SLIST_INIT(&hashtbl[i]); *hashmask = hashsize - 1; return (hashtbl); } static void ncfreetbl(struct nchashhead *hashtbl) { free(hashtbl, M_VFSCACHE); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { u_int i; cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); VFS_SMR_ZONE_SET(cache_zone_small); VFS_SMR_ZONE_SET(cache_zone_small_ts); VFS_SMR_ZONE_SET(cache_zone_large); VFS_SMR_ZONE_SET(cache_zone_large_ts); ncsize = desiredvnodes * ncsizefactor; cache_recalc_neg_min(ncnegminpct); nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ ncbuckethash = 7; if (ncbuckethash > nchash) ncbuckethash = nchash; bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numbucketlocks; i++) mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); ncvnodehash = ncbuckethash; vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numvnodelocks; i++) mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); for (i = 0; i < numneglists; i++) { mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); TAILQ_INIT(&neglists[i].nl_list); TAILQ_INIT(&neglists[i].nl_hotlist); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_vnode_init(struct vnode *vp) { LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); vp->v_cache_dd = NULL; cache_prehash(vp); } /* * Induce transient cache misses for lockless operation in cache_lookup() by * using a temporary hash table. * * This will force a fs lookup. * * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time * to observe all CPUs not performing the lookup. */ static void cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash) { MPASS(temphash < nchash); /* * Change the size. The new size is smaller and can safely be used * against the existing table. All lookups which now hash wrong will * result in a cache miss, which all callers are supposed to know how * to handle. */ atomic_store_long(&nchash, temphash); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated hash value, but they still * see the old table. */ atomic_store_ptr(&nchashtbl, temptbl); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated table pointer and size pair. */ } /* * Set the new hash table. * * Similarly to cache_changesize_set_temp(), this has to synchronize against * lockless operation in cache_lookup(). */ static void cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash) { MPASS(nchash < new_hash); /* * Change the pointer first. This wont result in out of bounds access * since the temporary table is guaranteed to be smaller. */ atomic_store_ptr(&nchashtbl, new_tbl); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated pointer value, but they * still see the old size. */ atomic_store_long(&nchash, new_hash); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated table pointer and size pair. */ } void cache_changesize(u_long newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl; u_long new_nchash, old_nchash, temphash; struct namecache *ncp; uint32_t hash; u_long newncsize; int i; newncsize = newmaxvnodes * ncsizefactor; newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); if (newmaxvnodes < numbucketlocks) newmaxvnodes = numbucketlocks; new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { ncfreetbl(new_nchashtbl); return; } temptbl = nchinittbl(1, &temphash); /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ cache_lock_all_vnodes(); cache_lock_all_buckets(); old_nchashtbl = nchashtbl; old_nchash = nchash; cache_changesize_set_temp(temptbl, temphash); for (i = 0; i <= old_nchash; i++) { while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash); } } ncsize = newncsize; cache_recalc_neg_min(ncnegminpct); cache_changesize_set_new(new_nchashtbl, new_nchash); cache_unlock_all_buckets(); cache_unlock_all_vnodes(); ncfreetbl(old_nchashtbl); ncfreetbl(temptbl); } /* * Remove all entries from and to a particular vnode. */ static void cache_purge_impl(struct vnode *vp) { struct cache_freebatch batch; struct namecache *ncp; struct mtx *vlp, *vlp2; TAILQ_INIT(&batch); vlp = VP2VNODELOCK(vp); vlp2 = NULL; mtx_lock(vlp); retry: while (!LIST_EMPTY(&vp->v_cache_src)) { ncp = LIST_FIRST(&vp->v_cache_src); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } while (!TAILQ_EMPTY(&vp->v_cache_dst)) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } ncp = vp->v_cache_dd; if (ncp != NULL) { KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); mtx_unlock(vlp); if (vlp2 != NULL) mtx_unlock(vlp2); cache_free_batch(&batch); } /* * Opportunistic check to see if there is anything to do. */ static bool cache_has_entries(struct vnode *vp) { if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && atomic_load_ptr(&vp->v_cache_dd) == NULL) return (false); return (true); } void cache_purge(struct vnode *vp) { SDT_PROBE1(vfs, namecache, purge, done, vp); if (!cache_has_entries(vp)) return; cache_purge_impl(vp); } /* * Only to be used by vgone. */ void cache_purge_vgone(struct vnode *vp) { struct mtx *vlp; VNPASS(VN_IS_DOOMED(vp), vp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } /* * Serialize against a potential thread doing cache_purge. */ vlp = VP2VNODELOCK(vp); mtx_wait_unlocked(vlp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } return; } /* * Remove all negative entries for a particular directory vnode. */ void cache_purge_negative(struct vnode *vp) { struct cache_freebatch batch; struct namecache *ncp, *nnp; struct mtx *vlp; SDT_PROBE1(vfs, namecache, purge_negative, done, vp); if (LIST_EMPTY(&vp->v_cache_src)) return; TAILQ_INIT(&batch); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { if (!(ncp->nc_flag & NCF_NEGATIVE)) continue; cache_zap_negative_locked_vnode_kl(ncp, vp); TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } mtx_unlock(vlp); cache_free_batch(&batch); } /* * Entry points for modifying VOP operations. */ void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) { ASSERT_VOP_IN_SEQC(fdvp); ASSERT_VOP_IN_SEQC(fvp); ASSERT_VOP_IN_SEQC(tdvp); if (tvp != NULL) ASSERT_VOP_IN_SEQC(tvp); cache_purge(fvp); if (tvp != NULL) { cache_purge(tvp); KASSERT(!cache_remove_cnp(tdvp, tcnp), ("%s: lingering negative entry", __func__)); } else { cache_remove_cnp(tdvp, tcnp); } /* * TODO * * Historically renaming was always purging all revelang entries, * but that's quite wasteful. In particular turns out that in many cases * the target file is immediately accessed after rename, inducing a cache * miss. * * Recode this to reduce relocking and reuse the existing entry (if any) * instead of just removing it above and allocating a new one here. */ cache_enter(tdvp, fvp, tcnp); } void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) { ASSERT_VOP_IN_SEQC(dvp); ASSERT_VOP_IN_SEQC(vp); cache_purge(vp); } #ifdef INVARIANTS /* * Validate that if an entry exists it matches. */ void cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct namecache *ncp; struct mtx *blp; uint32_t hash; hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); if (CK_SLIST_EMPTY(NCHHASH(hash))) return; blp = HASH2BUCKETLOCK(hash); mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { if (ncp->nc_vp != vp) panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); } } mtx_unlock(blp); } #endif /* * Flush all entries referencing a particular filesystem. */ void cache_purgevfs(struct mount *mp) { struct vnode *vp, *mvp; size_t visited, purged; visited = purged = 0; /* * Somewhat wasteful iteration over all vnodes. Would be better to * support filtering and avoid the interlock to begin with. */ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { visited++; if (!cache_has_entries(vp)) { VI_UNLOCK(vp); continue; } vholdl(vp); VI_UNLOCK(vp); cache_purge(vp); purged++; vdrop(vp); } SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged); } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* Implementation of the getcwd syscall. */ int sys___getcwd(struct thread *td, struct __getcwd_args *uap) { char *buf, *retbuf; size_t buflen; int error; buflen = uap->buflen; if (__predict_false(buflen < 2)) return (EINVAL); if (buflen > MAXPATHLEN) buflen = MAXPATHLEN; buf = uma_zalloc(namei_zone, M_WAITOK); error = vn_getcwd(buf, &retbuf, &buflen); if (error == 0) error = copyout(retbuf, uap->buf, buflen); uma_zfree(namei_zone, buf); return (error); } int vn_getcwd(char *buf, char **retbuf, size_t *buflen) { struct pwd *pwd; int error; vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); pwd_drop(pwd); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) ktrnamei(*retbuf); #endif return (error); } /* * Canonicalize a path by walking it forward and back. * * BUGS: * - Nothing guarantees the integrity of the entire chain. Consider the case * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of * "foo" into "quux" during the backwards walk. The result will be * "quux/bar/baz/qux", which could not have been obtained by an incremental * walk in userspace. Moreover, the path we return is inaccessible if the * calling thread lacks permission to traverse "quux". */ static int kern___realpathat(struct thread *td, int fd, const char *path, char *buf, size_t size, int flags, enum uio_seg pathseg) { struct nameidata nd; char *retbuf, *freebuf; int error; if (flags != 0) return (EINVAL); - NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights); if ((error = namei(&nd)) != 0) return (error); error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size); if (error == 0) { error = copyout(retbuf, buf, size); free(freebuf, M_TEMP); } NDFREE(&nd, 0); return (error); } int sys___realpathat(struct thread *td, struct __realpathat_args *uap) { return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, uap->flags, UIO_USERSPACE)); } /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) { struct pwd *pwd; char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); pwd_drop(pwd); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) { char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } static struct namecache * vn_dd_from_dst(struct vnode *vp) { struct namecache *ncp; cache_assert_vnode_locked(vp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) return (ncp); } return (NULL); } int vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) { struct vnode *dvp; struct namecache *ncp; struct mtx *vlp; int error; vlp = VP2VNODELOCK(*vp); mtx_lock(vlp); ncp = (*vp)->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { KASSERT(ncp == vn_dd_from_dst(*vp), ("%s: mismatch for dd entry (%p != %p)", __func__, ncp, vn_dd_from_dst(*vp))); } else { ncp = vn_dd_from_dst(*vp); } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { mtx_unlock(vlp); vrele(*vp); counter_u64_add(numfullpathfail4, 1); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, ncp->nc_name, vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); mtx_unlock(vlp); vrele(dvp); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); mtx_unlock(vlp); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); vput(*vp); if (error) { counter_u64_add(numfullpathfail2, 1); SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; if (VN_IS_DOOMED(dvp)) { /* forced unmount */ vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * Resolve a directory to a pathname. * * The name of the directory can always be found in the namecache or fetched * from the filesystem. There is also guaranteed to be only one parent, meaning * we can just follow vnodes up until we find the root. * * The vnode must be referenced. */ static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; size_t buflen; int error; bool slash_prefixed; VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); VNPASS(vp->v_usecount > 0, vp); buflen = *len; slash_prefixed = true; if (addend == 0) { MPASS(*len >= 2); buflen--; buf[buflen] = '\0'; slash_prefixed = false; } error = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); counter_u64_add(numfullpathcalls, 1); while (vp != rdir && vp != rootvnode) { /* * The vp vnode must be already fully constructed, * since it is either found in namecache or obtained * from VOP_VPTOCNP(). We may test for VV_ROOT safely * without obtaining the vnode lock. */ if ((vp->v_vflag & VV_ROOT) != 0) { vn_lock(vp, LK_RETRY | LK_SHARED); /* * With the vnode locked, check for races with * unmount, forced or not. Note that we * already verified that vp is not equal to * the root vnode, which means that * mnt_vnodecovered can be NULL only for the * case of unmount. */ if (VN_IS_DOOMED(vp) || (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || vp1->v_mountedhere != vp->v_mount) { vput(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } vref(vp1); vput(vp); vp = vp1; continue; } if (vp->v_type != VDIR) { vrele(vp); counter_u64_add(numfullpathfail1, 1); error = ENOTDIR; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } error = vn_vptocnp(&vp, buf, &buflen); if (error) break; if (buflen == 0) { vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = true; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { vrele(vp); counter_u64_add(numfullpathfail4, 1); SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } counter_u64_add(numfullpathfound, 1); vrele(vp); *retbuf = buf + buflen; SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); *len -= buflen; *len += addend; return (0); } /* * Resolve an arbitrary vnode to a pathname. * * Note 2 caveats: * - hardlinks are not tracked, thus if the vnode is not a directory this can * resolve to a different path than the one used to find it * - namecache is not mandatory, meaning names are not guaranteed to be added * (in which case resolving fails) */ static void __inline cache_rev_failed_impl(int *reason, int line) { *reason = line; } #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *tvp; struct mount *mp; struct namecache *ncp; size_t orig_buflen; int reason; int error; #ifdef KDTRACE_HOOKS int i; #endif seqc_t vp_seqc, tvp_seqc; u_char nc_flag; VFS_SMR_ASSERT_ENTERED(); if (!atomic_load_char(&cache_fast_lookup_enabled)) { vfs_smr_exit(); return (-1); } orig_buflen = *buflen; if (addend == 0) { MPASS(*buflen >= 2); *buflen -= 1; buf[*buflen] = '\0'; } if (vp == rdir || vp == rootvnode) { if (addend == 0) { *buflen -= 1; buf[*buflen] = '/'; } goto out_ok; } #ifdef KDTRACE_HOOKS i = 0; #endif error = -1; ncp = NULL; /* for sdt probe down below */ vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } for (;;) { #ifdef KDTRACE_HOOKS i++; #endif if ((vp->v_vflag & VV_ROOT) != 0) { mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) { cache_rev_failed(&reason); goto out_abort; } tvp = atomic_load_ptr(&mp->mnt_vnodecovered); tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; continue; } ncp = atomic_load_consume_ptr(&vp->v_cache_dd); if (ncp == NULL) { cache_rev_failed(&reason); goto out_abort; } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { cache_rev_failed(&reason); goto out_abort; } if (ncp->nc_nlen >= *buflen) { cache_rev_failed(&reason); error = ENOMEM; goto out_abort; } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); *buflen -= 1; buf[*buflen] = '/'; tvp = ncp->nc_dvp; tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } /* * Acquire fence provided by vn_seqc_read_any above. */ if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { cache_rev_failed(&reason); goto out_abort; } if (!cache_ncp_canuse(ncp)) { cache_rev_failed(&reason); goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; if (vp == rdir || vp == rootvnode) break; } out_ok: vfs_smr_exit(); *retbuf = buf + *buflen; *buflen = orig_buflen - *buflen + addend; SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); return (0); out_abort: *buflen = orig_buflen; SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); vfs_smr_exit(); return (error); } static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen) { size_t orig_buflen, addend; int error; if (*buflen < 2) return (EINVAL); orig_buflen = *buflen; vref(vp); addend = 0; if (vp->v_type != VDIR) { *buflen -= 1; buf[*buflen] = '\0'; error = vn_vptocnp(&vp, buf, buflen); if (error) return (error); if (*buflen == 0) { vrele(vp); return (ENOMEM); } *buflen -= 1; buf[*buflen] = '/'; addend = orig_buflen - *buflen; } return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); } /* * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). * - * Since the namecache does not track hardlinks, the caller is - * expected to first look up the target vnode with SAVENAME | - * WANTPARENT flags passed to namei to get dvp and vp. + * Since the namecache does not track hardlinks, the caller is expected to + * first look up the target vnode with WANTPARENT flag passed to namei to get + * dvp and vp. * * Then we have 2 cases: * - if the found vnode is a directory, the path can be constructed just by * following names up the chain * - otherwise we populate the buffer with the saved name and start resolving * from the parent */ int vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, const char *hrdl_name, size_t hrdl_name_length, char **retbuf, char **freebuf, size_t *buflen) { char *buf, *tmpbuf; struct pwd *pwd; size_t addend; int error; enum vtype type; if (*buflen < 2) return (EINVAL); if (*buflen > MAXPATHLEN) *buflen = MAXPATHLEN; buf = malloc(*buflen, M_TEMP, M_WAITOK); addend = 0; /* * Check for VBAD to work around the vp_crossmp bug in lookup(). * * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be * set to mount point's root vnode while ni_dvp will be vp_crossmp. * If the type is VDIR (like in this very case) we can skip looking * at ni_dvp in the first place. However, since vnodes get passed here * unlocked the target may transition to doomed state (type == VBAD) * before we get to evaluate the condition. If this happens, we will * populate part of the buffer and descend to vn_fullpath_dir with * vp == vp_crossmp. Prevent the problem by checking for VBAD. * * This should be atomic_load(&vp->v_type) but it is illegal to take * an address of a bit field, even if said field is sized to char. * Work around the problem by reading the value into a full-sized enum * and then re-reading it with atomic_load which will still prevent * the compiler from re-reading down the road. */ type = vp->v_type; type = atomic_load_int(&type); if (type == VBAD) { error = ENOENT; goto out_bad; } if (type != VDIR) { addend = hrdl_name_length + 2; if (*buflen < addend) { error = ENOMEM; goto out_bad; } *buflen -= addend; tmpbuf = buf + *buflen; tmpbuf[0] = '/'; memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length); tmpbuf[addend - 1] = '\0'; vp = dvp; } vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); vref(vp); error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); pwd_drop(pwd); } if (error != 0) goto out_bad; *freebuf = buf; return (0); out_bad: free(buf, M_TEMP); return (error); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; struct mtx *vlp; enum vgetstate vs; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vs = vget_prep(ddvp); mtx_unlock(vlp); if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) return (NULL); return (ddvp); } mtx_unlock(vlp); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; struct mtx *vlp; int l; vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { mtx_unlock(vlp); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, ncp->nc_name, l); mtx_unlock(vlp); buf[l] = '\0'; return (0); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp); error = vn_fullpath_global(vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE_PNBUF(&nd); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } #ifdef DDB static void db_print_vpath(struct vnode *vp) { while (vp != NULL) { db_printf("%p: ", vp); if (vp == rootvnode) { db_printf("/"); vp = NULL; } else { if (vp->v_vflag & VV_ROOT) { db_printf(""); vp = vp->v_mount->mnt_vnodecovered; } else { struct namecache *ncp; char *ncn; int i; ncp = TAILQ_FIRST(&vp->v_cache_dst); if (ncp != NULL) { ncn = ncp->nc_name; for (i = 0; i < ncp->nc_nlen; i++) db_printf("%c", *ncn++); vp = ncp->nc_dvp; } else { vp = NULL; } } } db_printf("\n"); } return; } DB_SHOW_COMMAND(vpath, db_show_vpath) { struct vnode *vp; if (!have_addr) { db_printf("usage: show vpath \n"); return; } vp = (struct vnode *)addr; db_print_vpath(vp); } #endif static int cache_fast_lookup = 1; #define CACHE_FPL_FAILED -2020 void cache_fast_lookup_enabled_recalc(void) { int lookup_flag; int mac_on; #ifdef MAC mac_on = mac_vnode_check_lookup_enabled(); mac_on |= mac_vnode_check_readlink_enabled(); #else mac_on = 0; #endif lookup_flag = atomic_load_int(&cache_fast_lookup); if (lookup_flag && !mac_on) { atomic_store_char(&cache_fast_lookup_enabled, true); } else { atomic_store_char(&cache_fast_lookup_enabled, false); } } static int syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) { int error, old; old = atomic_load_int(&cache_fast_lookup); error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) cache_fast_lookup_enabled_recalc(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); /* * Components of nameidata (or objects it can point to) which may * need restoring in case fast path lookup fails. */ struct nameidata_outer { size_t ni_pathlen; int cn_flags; }; struct nameidata_saved { #ifdef INVARIANTS char *cn_nameptr; size_t ni_pathlen; #endif }; #ifdef INVARIANTS struct cache_fpl_debug { size_t ni_pathlen; }; #endif struct cache_fpl { struct nameidata *ndp; struct componentname *cnp; char *nulchar; struct vnode *dvp; struct vnode *tvp; seqc_t dvp_seqc; seqc_t tvp_seqc; uint32_t hash; struct nameidata_saved snd; struct nameidata_outer snd_outer; int line; enum cache_fpl_status status:8; bool in_smr; bool fsearch; - bool savename; struct pwd **pwd; #ifdef INVARIANTS struct cache_fpl_debug debug; #endif }; static bool cache_fplookup_mp_supported(struct mount *mp); static bool cache_fplookup_is_mp(struct cache_fpl *fpl); static int cache_fplookup_cross_mount(struct cache_fpl *fpl); static int cache_fplookup_partial_setup(struct cache_fpl *fpl); static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); static int cache_fplookup_trailingslash(struct cache_fpl *fpl); static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); static void cache_fpl_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); -#ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; -#endif } static struct vnode * cache_fpl_handle_root(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; MPASS(*(cnp->cn_nameptr) == '/'); cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); if (__predict_false(*(cnp->cn_nameptr) == '/')) { do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); } return (ndp->ni_rootdir); } static void cache_fpl_checkpoint_outer(struct cache_fpl *fpl) { fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; } static void cache_fpl_checkpoint(struct cache_fpl *fpl) { #ifdef INVARIANTS fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; #endif } static void cache_fpl_restore_partial(struct cache_fpl *fpl) { fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; #ifdef INVARIANTS fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; #endif } static void cache_fpl_restore_abort(struct cache_fpl *fpl) { cache_fpl_restore_partial(fpl); /* * It is 0 on entry by API contract. */ fpl->ndp->ni_resflags = 0; fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; } #ifdef INVARIANTS #define cache_fpl_smr_assert_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ VFS_SMR_ASSERT_ENTERED(); \ }) #define cache_fpl_smr_assert_not_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ VFS_SMR_ASSERT_NOT_ENTERED(); \ }) static void cache_fpl_assert_status(struct cache_fpl *fpl) { switch (fpl->status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_DESTROYED: case CACHE_FPL_STATUS_ABORTED: case CACHE_FPL_STATUS_PARTIAL: case CACHE_FPL_STATUS_HANDLED: break; } } #else #define cache_fpl_smr_assert_entered(fpl) do { } while (0) #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) #define cache_fpl_assert_status(fpl) do { } while (0) #endif #define cache_fpl_smr_enter_initial(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_enter(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_exit(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ vfs_smr_exit(); \ _fpl->in_smr = false; \ }) static int cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) { if (fpl->status != CACHE_FPL_STATUS_UNSET) { KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; return (CACHE_FPL_FAILED); } #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) static int __noinline cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; if (fpl->status != CACHE_FPL_STATUS_UNSET) { KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; if (fpl->in_smr) cache_fpl_smr_exit(fpl); cache_fpl_restore_abort(fpl); /* * Resolving symlinks overwrites data passed by the caller. * Let namei know. */ if (ndp->ni_loopcnt > 0) { fpl->status = CACHE_FPL_STATUS_DESTROYED; cache_fpl_cleanup_cnp(cnp); } return (CACHE_FPL_FAILED); } #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) static int __noinline cache_fpl_partial_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to partial at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_entered(fpl); fpl->status = CACHE_FPL_STATUS_PARTIAL; fpl->line = line; return (cache_fplookup_partial_setup(fpl)); } #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) static int cache_fpl_handled_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; return (0); } #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) static int cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); MPASS(error != 0); MPASS(error != CACHE_FPL_FAILED); cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; fpl->dvp = NULL; fpl->tvp = NULL; - fpl->savename = false; return (error); } #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) static bool cache_fpl_terminated(struct cache_fpl *fpl) { return (fpl->status != CACHE_FPL_STATUS_UNSET); } #define CACHE_FPL_SUPPORTED_CN_FLAGS \ (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ - FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | SAVENAME | SAVESTART | \ - WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | \ - OPENREAD | OPENWRITE | WANTIOCTLCAPS) + FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | SAVESTART | WILLBEDIR | \ + ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \ + OPENWRITE | WANTIOCTLCAPS) #define CACHE_FPL_INTERNAL_CN_FLAGS \ (ISDOTDOT | MAKEENTRY | ISLASTCN) _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, "supported and internal flags overlap"); static bool cache_fpl_islastcn(struct nameidata *ndp) { return (*ndp->ni_next == 0); } static bool cache_fpl_istrailingslash(struct cache_fpl *fpl) { MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf); return (*(fpl->nulchar - 1) == '/'); } static bool cache_fpl_isdotdot(struct componentname *cnp) { if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') return (true); return (false); } static bool cache_can_fplookup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct thread *td; ndp = fpl->ndp; cnp = fpl->cnp; td = curthread; if (!atomic_load_char(&cache_fast_lookup_enabled)) { cache_fpl_aborted_early(fpl); return (false); } if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { cache_fpl_aborted_early(fpl); return (false); } if (IN_CAPABILITY_MODE(td)) { cache_fpl_aborted_early(fpl); return (false); } if (AUDITING_TD(td)) { cache_fpl_aborted_early(fpl); return (false); } if (ndp->ni_startdir != NULL) { cache_fpl_aborted_early(fpl); return (false); } return (true); } static int __noinline cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) { struct nameidata *ndp; struct componentname *cnp; int error; bool fsearch; ndp = fpl->ndp; cnp = fpl->cnp; error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } fpl->fsearch = fsearch; if ((*vpp)->v_type != VDIR) { if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } } return (0); } static int __noinline cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, uint32_t hash) { struct componentname *cnp; struct vnode *dvp; cnp = fpl->cnp; dvp = fpl->dvp; cache_fpl_smr_exit(fpl); if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) return (cache_fpl_handled_error(fpl, ENOENT)); else return (cache_fpl_aborted(fpl)); } /* * The target vnode is not supported, prepare for the slow path to take over. */ static int __noinline cache_fplookup_partial_setup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp; struct pwd *pwd; seqc_t dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; pwd = *(fpl->pwd); dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; if (!pwd_hold_smr(pwd)) { return (cache_fpl_aborted(fpl)); } /* * Note that seqc is checked before the vnode is locked, so by * the time regular lookup gets to it it may have moved. * * Ultimately this does not affect correctness, any lookup errors * are userspace racing with itself. It is guaranteed that any * path which ultimately gets found could also have been found * by regular lookup going all the way in absence of concurrent * modifications. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } cache_fpl_restore_partial(fpl); #ifdef INVARIANTS if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); } #endif ndp->ni_startdir = dvp; cnp->cn_flags |= MAKEENTRY; if (cache_fpl_islastcn(ndp)) cnp->cn_flags |= ISLASTCN; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; /* * Skip potential extra slashes parsing did not take care of. * cache_fplookup_skip_slashes explains the mechanism. */ if (__predict_false(*(cnp->cn_nameptr) == '/')) { do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); } ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; #ifdef INVARIANTS if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif return (0); } static int cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) { struct componentname *cnp; struct vnode *tvp; seqc_t tvp_seqc; int error, lkflags; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } if (!vn_seqc_consistent(tvp, tvp_seqc)) { if ((cnp->cn_flags & LOCKLEAF) != 0) vput(tvp); else vrele(tvp); return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled(fpl)); } /* * They want to possibly modify the state of the namecache. */ static int __noinline cache_fplookup_final_modifying(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp, *tvp; struct mount *mp; seqc_t dvp_seqc; int error; bool docache; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; MPASS(*(cnp->cn_nameptr) != '/'); MPASS(cache_fpl_islastcn(ndp)); if ((cnp->cn_flags & LOCKPARENT) == 0) MPASS((cnp->cn_flags & WANTPARENT) != 0); MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME); MPASS((cnp->cn_flags & MAKEENTRY) == 0); MPASS((cnp->cn_flags & ISDOTDOT) == 0); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) docache = false; /* * Regular lookup nulifies the slash, which we don't do here. * Don't take chances with filesystem routines seeing it for * the last entry. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_partial(fpl)); } mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { cache_fpl_smr_exit(fpl); /* * Original code keeps not checking for CREATE which * might be a bug. For now let the old lookup decide. */ if (cnp->cn_nameiop == CREATE) { return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled_error(fpl, EROFS)); } if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EEXIST)); } /* * Secure access to dvp; check cache_fplookup_partial_setup for * reasoning. * * XXX At least UFS requires its lookup routine to be called for * the last path component, which leads to some level of complication * and inefficiency: * - the target routine always locks the target vnode, but our caller * may not need it locked * - some of the VOP machinery asserts that the parent is locked, which * once more may be not required * * TODO: add a flag for filesystems which don't need this. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } error = vn_lock(dvp, LK_EXCLUSIVE); if (__predict_false(error != 0)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } tvp = NULL; cnp->cn_flags |= ISLASTCN; if (docache) cnp->cn_flags |= MAKEENTRY; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; cnp->cn_lkflags = LK_EXCLUSIVE; error = VOP_LOOKUP(dvp, &tvp, cnp); switch (error) { case EJUSTRETURN: case 0: break; case ENOTDIR: case ENOENT: vput(dvp); return (cache_fpl_handled_error(fpl, error)); default: vput(dvp); return (cache_fpl_aborted(fpl)); } fpl->tvp = tvp; - fpl->savename = (cnp->cn_flags & SAVENAME) != 0; if (tvp == NULL) { if ((cnp->cn_flags & SAVESTART) != 0) { ndp->ni_startdir = dvp; vrefact(ndp->ni_startdir); - cnp->cn_flags |= SAVENAME; - fpl->savename = true; } MPASS(error == EJUSTRETURN); if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } /* * There are very hairy corner cases concerning various flag combinations * and locking state. In particular here we only hold one lock instead of * two. * * Skip the complexity as it is of no significance for normal workloads. */ if (__predict_false(tvp == dvp)) { vput(dvp); vrele(tvp); return (cache_fpl_aborted(fpl)); } /* * If they want the symlink itself we are fine, but if they want to * follow it regular lookup has to be engaged. */ if (tvp->v_type == VLNK) { if ((cnp->cn_flags & FOLLOW) != 0) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } } /* * Since we expect this to be the terminal vnode it should almost never * be a mount point. */ if (__predict_false(cache_fplookup_is_mp(fpl))) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & FAILIFEXISTS) != 0) { vput(dvp); vput(tvp); return (cache_fpl_handled_error(fpl, EEXIST)); } if ((cnp->cn_flags & LOCKLEAF) == 0) { VOP_UNLOCK(tvp); } if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } if ((cnp->cn_flags & SAVESTART) != 0) { ndp->ni_startdir = dvp; vrefact(ndp->ni_startdir); - cnp->cn_flags |= SAVENAME; - fpl->savename = true; } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_modifying(struct cache_fpl *fpl) { struct nameidata *ndp; ndp = fpl->ndp; if (!cache_fpl_islastcn(ndp)) { return (cache_fpl_partial(fpl)); } return (cache_fplookup_final_modifying(fpl)); } static int __noinline cache_fplookup_final_withparent(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate dvs, tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; int error; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); /* * This is less efficient than it can be for simplicity. */ dvs = vget_prep_smr(dvp); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { cache_fpl_smr_exit(fpl); vget_abort(dvp, dvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); if ((cnp->cn_flags & LOCKPARENT) != 0) { error = vget_finish(dvp, LK_EXCLUSIVE, dvs); if (__predict_false(error != 0)) { vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { vget_abort(tvp, tvs); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (cache_fpl_aborted(fpl)); } error = cache_fplookup_final_child(fpl, tvs); if (__predict_false(error != 0)) { MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED || fpl->status == CACHE_FPL_STATUS_DESTROYED); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (error); } MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); return (0); } static int cache_fplookup_final(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; MPASS(*(cnp->cn_nameptr) != '/'); if (cnp->cn_nameiop != LOOKUP) { return (cache_fplookup_final_modifying(fpl)); } if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) return (cache_fplookup_final_withparent(fpl)); tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { cache_fpl_smr_exit(fpl); vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fplookup_final_child(fpl, tvs)); } /* * Comment from locked lookup: * Check for degenerate name (e.g. / or "") which is a way of talking about a * directory, e.g. like "/." or ".". */ static int __noinline cache_fplookup_degenerate(struct cache_fpl *fpl) { struct componentname *cnp; struct vnode *dvp; enum vgetstate dvs; int error, lkflags; #ifdef INVARIANTS char *cp; #endif fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; #ifdef INVARIANTS for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { KASSERT(*cp == '/', ("%s: encountered non-slash; string [%s]\n", __func__, cnp->cn_pnbuf)); } #endif if (__predict_false(cnp->cn_nameiop != LOOKUP)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EISDIR)); } MPASS((cnp->cn_flags & SAVESTART) == 0); if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { return (cache_fplookup_final_withparent(fpl)); } dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(dvp, lkflags, dvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_emptypath(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate tvs; struct vnode *tvp; int error, lkflags; fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; tvp = fpl->tvp; MPASS(*cnp->cn_pnbuf == '\0'); if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOENT)); } MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); tvs = vget_prep_smr(tvp); cache_fpl_smr_exit(fpl); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } ndp->ni_resflags |= NIRES_EMPTYPATH; return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_noentry(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; int error; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; MPASS((cnp->cn_flags & MAKEENTRY) == 0); MPASS((cnp->cn_flags & ISDOTDOT) == 0); if (cnp->cn_nameiop == LOOKUP) MPASS((cnp->cn_flags & NOCACHE) == 0); MPASS(!cache_fpl_isdotdot(cnp)); /* * Hack: delayed name len checking. */ if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); } if (cnp->cn_nameptr[0] == '/') { return (cache_fplookup_skip_slashes(fpl)); } if (cnp->cn_pnbuf[0] == '\0') { return (cache_fplookup_emptypath(fpl)); } if (cnp->cn_nameptr[0] == '\0') { if (fpl->tvp == NULL) { return (cache_fplookup_degenerate(fpl)); } return (cache_fplookup_trailingslash(fpl)); } if (cnp->cn_nameiop != LOOKUP) { fpl->tvp = NULL; return (cache_fplookup_modifying(fpl)); } MPASS((cnp->cn_flags & SAVESTART) == 0); /* * Only try to fill in the component if it is the last one, * otherwise not only there may be several to handle but the * walk may be complicated. */ if (!cache_fpl_islastcn(ndp)) { return (cache_fpl_partial(fpl)); } /* * Regular lookup nulifies the slash, which we don't do here. * Don't take chances with filesystem routines seeing it for * the last entry. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_partial(fpl)); } /* * Secure access to dvp; check cache_fplookup_partial_setup for * reasoning. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } error = vn_lock(dvp, LK_SHARED); if (__predict_false(error != 0)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } tvp = NULL; /* * TODO: provide variants which don't require locking either vnode. */ cnp->cn_flags |= ISLASTCN | MAKEENTRY; cnp->cn_lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) { cnp->cn_lkflags = LK_EXCLUSIVE; } error = VOP_LOOKUP(dvp, &tvp, cnp); switch (error) { case EJUSTRETURN: case 0: break; case ENOTDIR: case ENOENT: vput(dvp); return (cache_fpl_handled_error(fpl, error)); default: vput(dvp); return (cache_fpl_aborted(fpl)); } fpl->tvp = tvp; - if (!fpl->savename) { - MPASS((cnp->cn_flags & SAVENAME) == 0); - } if (tvp == NULL) { MPASS(error == EJUSTRETURN); if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { vput(dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } if (tvp->v_type == VLNK) { if ((cnp->cn_flags & FOLLOW) != 0) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } } if (__predict_false(cache_fplookup_is_mp(fpl))) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) == 0) { VOP_UNLOCK(tvp); } if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { vput(dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_dot(struct cache_fpl *fpl) { int error; MPASS(!seqc_in_modify(fpl->dvp_seqc)); /* * Just re-assign the value. seqc will be checked later for the first * non-dot path component in line and/or before deciding to return the * vnode. */ fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); error = 0; if (cache_fplookup_is_mp(fpl)) { error = cache_fplookup_cross_mount(fpl); } return (error); } static int __noinline cache_fplookup_dotdot(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct namecache *ncp; struct vnode *dvp; struct prison *pr; u_char nc_flag; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; MPASS(cache_fpl_isdotdot(cnp)); /* * XXX this is racy the same way regular lookup is */ for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dvp == pr->pr_root) break; if (dvp == ndp->ni_rootdir || dvp == ndp->ni_topdir || dvp == rootvnode || pr != NULL) { fpl->tvp = dvp; fpl->tvp_seqc = vn_seqc_read_any(dvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } if ((dvp->v_vflag & VV_ROOT) != 0) { /* * TODO * The opposite of climb mount is needed here. */ return (cache_fpl_partial(fpl)); } ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); if (ncp == NULL) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { if ((nc_flag & NCF_NEGATIVE) != 0) return (cache_fpl_aborted(fpl)); fpl->tvp = ncp->nc_vp; } else { fpl->tvp = ncp->nc_dvp; } fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } /* * Acquire fence provided by vn_seqc_read_any above. */ if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { return (cache_fpl_aborted(fpl)); } if (!cache_ncp_canuse(ncp)) { return (cache_fpl_aborted(fpl)); } counter_u64_add(dotdothits, 1); return (0); } static int __noinline cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) { u_char nc_flag __diagused; bool neg_promote; #ifdef INVARIANTS nc_flag = atomic_load_char(&ncp->nc_flag); MPASS((nc_flag & NCF_NEGATIVE) != 0); #endif /* * If they want to create an entry we need to replace this one. */ if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { fpl->tvp = NULL; return (cache_fplookup_modifying(fpl)); } neg_promote = cache_neg_hit_prep(ncp); if (!cache_fpl_neg_ncp_canuse(ncp)) { cache_neg_hit_abort(ncp); return (cache_fpl_partial(fpl)); } if (neg_promote) { return (cache_fplookup_negative_promote(fpl, ncp, hash)); } cache_neg_hit_finish(ncp); cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOENT)); } /* * Resolve a symlink. Called by filesystem-specific routines. * * Code flow is: * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve */ int cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) { struct nameidata *ndp; struct componentname *cnp; size_t adjust; ndp = fpl->ndp; cnp = fpl->cnp; if (__predict_false(len == 0)) { return (ENOENT); } if (__predict_false(len > MAXPATHLEN - 2)) { if (cache_fpl_istrailingslash(fpl)) { return (EAGAIN); } } ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; #ifdef INVARIANTS if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { return (ENAMETOOLONG); } if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { return (ELOOP); } adjust = len; if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); } else { if (cache_fpl_istrailingslash(fpl)) { adjust = len + 1; cnp->cn_pnbuf[len] = '/'; cnp->cn_pnbuf[len + 1] = '\0'; } else { cnp->cn_pnbuf[len] = '\0'; } } bcopy(string, cnp->cn_pnbuf, len); ndp->ni_pathlen += adjust; cache_fpl_pathlen_add(fpl, adjust); cnp->cn_nameptr = cnp->cn_pnbuf; fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; fpl->tvp = NULL; return (0); } static int __noinline cache_fplookup_symlink(struct cache_fpl *fpl) { struct mount *mp; struct nameidata *ndp; struct componentname *cnp; struct vnode *dvp, *tvp; int error; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; tvp = fpl->tvp; if (cache_fpl_islastcn(ndp)) { if ((cnp->cn_flags & FOLLOW) == 0) { return (cache_fplookup_final(fpl)); } } mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } /* * Note this check races against setting the flag just like regular * lookup. */ if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EACCES)); } error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); if (__predict_false(error != 0)) { switch (error) { case EAGAIN: return (cache_fpl_partial(fpl)); case ENOENT: case ENAMETOOLONG: case ELOOP: cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, error)); default: return (cache_fpl_aborted(fpl)); } } if (*(cnp->cn_nameptr) == '/') { fpl->dvp = cache_fpl_handle_root(fpl); fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { return (cache_fpl_aborted(fpl)); } /* * The main loop assumes that ->dvp points to a vnode belonging * to a filesystem which can do lockless lookup, but the absolute * symlink can be wandering off to one which does not. */ mp = atomic_load_ptr(&fpl->dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } if (!cache_fplookup_mp_supported(mp)) { cache_fpl_checkpoint(fpl); return (cache_fpl_partial(fpl)); } } return (0); } static int cache_fplookup_next(struct cache_fpl *fpl) { struct componentname *cnp; struct namecache *ncp; struct vnode *dvp, *tvp; u_char nc_flag; uint32_t hash; int error; cnp = fpl->cnp; dvp = fpl->dvp; hash = fpl->hash; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) { return (cache_fplookup_dot(fpl)); } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { return (cache_fplookup_dotdot(fpl)); } } MPASS(!cache_fpl_isdotdot(cnp)); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { return (cache_fplookup_noentry(fpl)); } tvp = atomic_load_ptr(&ncp->nc_vp); nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) != 0) { return (cache_fplookup_neg(fpl, ncp, hash)); } if (!cache_ncp_canuse(ncp)) { return (cache_fpl_partial(fpl)); } fpl->tvp = tvp; fpl->tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } counter_u64_add(numposhits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); error = 0; if (cache_fplookup_is_mp(fpl)) { error = cache_fplookup_cross_mount(fpl); } return (error); } static bool cache_fplookup_mp_supported(struct mount *mp) { MPASS(mp != NULL); if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) return (false); return (true); } /* * Walk up the mount stack (if any). * * Correctness is provided in the following ways: * - all vnodes are protected from freeing with SMR * - struct mount objects are type stable making them always safe to access * - stability of the particular mount is provided by busying it * - relationship between the vnode which is mounted on and the mount is * verified with the vnode sequence counter after busying * - association between root vnode of the mount and the mount is protected * by busy * * From that point on we can read the sequence counter of the root vnode * and get the next mount on the stack (if any) using the same protection. * * By the end of successful walk we are guaranteed the reached state was * indeed present at least at some point which matches the regular lookup. */ static int __noinline cache_fplookup_climb_mount(struct cache_fpl *fpl) { struct mount *mp, *prev_mp; struct mount_pcpu *mpcpu, *prev_mpcpu; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp == NULL)) { return (0); } prev_mp = NULL; for (;;) { if (!vfs_op_thread_enter_crit(mp, mpcpu)) { if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); return (cache_fpl_partial(fpl)); } if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } prev_mp = mp; prev_mpcpu = mpcpu; mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) break; } vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } static int __noinline cache_fplookup_cross_mount(struct cache_fpl *fpl) { struct mount *mp; struct mount_pcpu *mpcpu; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp == NULL)) { return (0); } if (!vfs_op_thread_enter_crit(mp, mpcpu)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (__predict_false(vp == NULL)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); vfs_op_thread_exit_crit(mp, mpcpu); if (seqc_in_modify(vp_seqc)) { return (cache_fpl_partial(fpl)); } mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp != NULL)) { /* * There are possibly more mount points on top. * Normally this does not happen so for simplicity just start * over. */ return (cache_fplookup_climb_mount(fpl)); } fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } /* * Check if a vnode is mounted on. */ static bool cache_fplookup_is_mp(struct cache_fpl *fpl) { struct vnode *vp; vp = fpl->tvp; return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); } /* * Parse the path. * * The code was originally copy-pasted from regular lookup and despite * clean ups leaves performance on the table. Any modifications here * must take into account that in case off fallback the resulting * nameidata state has to be compatible with the original. */ /* * Debug ni_pathlen tracking. */ #ifdef INVARIANTS static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) { fpl->debug.ni_pathlen += n; KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); } static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) { fpl->debug.ni_pathlen -= n; KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); } static void cache_fpl_pathlen_inc(struct cache_fpl *fpl) { cache_fpl_pathlen_add(fpl, 1); } static void cache_fpl_pathlen_dec(struct cache_fpl *fpl) { cache_fpl_pathlen_sub(fpl, 1); } #else static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) { } static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) { } static void cache_fpl_pathlen_inc(struct cache_fpl *fpl) { } static void cache_fpl_pathlen_dec(struct cache_fpl *fpl) { } #endif static void cache_fplookup_parse(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct vnode *dvp; char *cp; uint32_t hash; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; /* * Find the end of this path component, it is either / or nul. * * Store / as a temporary sentinel so that we only have one character * to test for. Pathnames tend to be short so this should not be * resulting in cache misses. * * TODO: fix this to be word-sized. */ MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf); KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], fpl->nulchar, cnp->cn_pnbuf)); KASSERT(*fpl->nulchar == '\0', ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, cnp->cn_pnbuf)); hash = cache_get_hash_iter_start(dvp); *fpl->nulchar = '/'; for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { KASSERT(*cp != '\0', ("%s: encountered unexpected nul; string [%s]\n", __func__, cnp->cn_nameptr)); hash = cache_get_hash_iter(*cp, hash); continue; } *fpl->nulchar = '\0'; fpl->hash = cache_get_hash_iter_finish(hash); cnp->cn_namelen = cp - cnp->cn_nameptr; cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); #ifdef INVARIANTS /* * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since * we are going to fail this lookup with ENAMETOOLONG (see below). */ if (cnp->cn_namelen <= NAME_MAX) { if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) { panic("%s: mismatched hash for [%s] len %ld", __func__, cnp->cn_nameptr, cnp->cn_namelen); } } #endif /* * Hack: we have to check if the found path component's length exceeds * NAME_MAX. However, the condition is very rarely true and check can * be elided in the common case -- if an entry was found in the cache, * then it could not have been too long to begin with. */ ndp->ni_next = cp; } static void cache_fplookup_parse_advance(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; cnp->cn_nameptr = ndp->ni_next; KASSERT(*(cnp->cn_nameptr) == '/', ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__, cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf)); cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } /* * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry. * * Lockless lookup tries to elide checking for spurious slashes and should they * be present is guaranteed to fail to find an entry. In this case the caller * must check if the name starts with a slash and call this routine. It is * going to fast forward across the spurious slashes and set the state up for * retry. */ static int __noinline cache_fplookup_skip_slashes(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; MPASS(*(cnp->cn_nameptr) == '/'); do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); /* * Go back to one slash so that cache_fplookup_parse_advance has * something to skip. */ cnp->cn_nameptr--; cache_fpl_pathlen_inc(fpl); /* * cache_fplookup_parse_advance starts from ndp->ni_next */ ndp->ni_next = cnp->cn_nameptr; /* * See cache_fplookup_dot. */ fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; return (0); } /* * Handle trailing slashes (e.g., "foo/"). * * If a trailing slash is found the terminal vnode must be a directory. * Regular lookup shortens the path by nulifying the first trailing slash and * sets the TRAILINGSLASH flag to denote this took place. There are several * checks on it performed later. * * Similarly to spurious slashes, lockless lookup handles this in a speculative * manner relying on an invariant that a non-directory vnode will get a miss. * In this case cn_nameptr[0] == '\0' and cn_namelen == 0. * * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/" * and denotes this is the last path component, which avoids looping back. * * Only plain lookups are supported for now to restrict corner cases to handle. */ static int __noinline cache_fplookup_trailingslash(struct cache_fpl *fpl) { #ifdef INVARIANTS size_t ni_pathlen; #endif struct nameidata *ndp; struct componentname *cnp; struct namecache *ncp; struct vnode *tvp; char *cn_nameptr_orig, *cn_nameptr_slash; seqc_t tvp_seqc; u_char nc_flag; ndp = fpl->ndp; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; MPASS(fpl->dvp == fpl->tvp); KASSERT(cache_fpl_istrailingslash(fpl), ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, cnp->cn_pnbuf)); KASSERT(cnp->cn_nameptr[0] == '\0', ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], cnp->cn_pnbuf)); KASSERT(cnp->cn_namelen == 0, ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, cnp->cn_pnbuf)); MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); if (cnp->cn_nameiop != LOOKUP) { return (cache_fpl_aborted(fpl)); } if (__predict_false(tvp->v_type != VDIR)) { if (!vn_seqc_consistent(tvp, tvp_seqc)) { return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } /* * Denote the last component. */ ndp->ni_next = &cnp->cn_nameptr[0]; MPASS(cache_fpl_islastcn(ndp)); /* * Unwind trailing slashes. */ cn_nameptr_orig = cnp->cn_nameptr; while (cnp->cn_nameptr >= cnp->cn_pnbuf) { cnp->cn_nameptr--; if (cnp->cn_nameptr[0] != '/') { break; } } /* * Unwind to the beginning of the path component. * * Note the path may or may not have started with a slash. */ cn_nameptr_slash = cnp->cn_nameptr; while (cnp->cn_nameptr > cnp->cn_pnbuf) { cnp->cn_nameptr--; if (cnp->cn_nameptr[0] == '/') { break; } } if (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; } cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); cache_fpl_checkpoint(fpl); #ifdef INVARIANTS ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; if (ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif /* * If this was a "./" lookup the parent directory is already correct. */ if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { return (0); } /* * Otherwise we need to look it up. */ tvp = fpl->tvp; ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); if (__predict_false(ncp == NULL)) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { return (cache_fpl_aborted(fpl)); } fpl->dvp = ncp->nc_dvp; fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } /* * See the API contract for VOP_FPLOOKUP_VEXEC. */ static int __noinline cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) { struct componentname *cnp; struct vnode *dvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; /* * Hack: delayed empty path checking. */ if (cnp->cn_pnbuf[0] == '\0') { return (cache_fplookup_emptypath(fpl)); } /* * TODO: Due to ignoring trailing slashes lookup will perform a * permission check on the last dir when it should not be doing it. It * may fail, but said failure should be ignored. It is possible to fix * it up fully without resorting to regular lookup, but for now just * abort. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_aborted(fpl)); } /* * Hack: delayed degenerate path checking. */ if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { return (cache_fplookup_degenerate(fpl)); } /* * Hack: delayed name len checking. */ if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); } /* * Hack: they may be looking up foo/bar, where foo is not a directory. * In such a case we need to return ENOTDIR, but we may happen to get * here with a different error. */ if (dvp->v_type != VDIR) { error = ENOTDIR; } /* * Hack: handle O_SEARCH. * * Open Group Base Specifications Issue 7, 2018 edition states: * * If the access mode of the open file description associated with the * file descriptor is not O_SEARCH, the function shall check whether * directory searches are permitted using the current permissions of * the directory underlying the file descriptor. If the access mode is * O_SEARCH, the function shall not perform the check. * * * Regular lookup tests for the NOEXECCHECK flag for every path * component to decide whether to do the permission check. However, * since most lookups never have the flag (and when they do it is only * present for the first path component), lockless lookup only acts on * it if there is a permission problem. Here the flag is represented * with a boolean so that we don't have to clear it on the way out. * * For simplicity this always aborts. * TODO: check if this is the first lookup and ignore the permission * problem. Note the flag has to survive fallback (if it happens to be * performed). */ if (fpl->fsearch) { return (cache_fpl_aborted(fpl)); } switch (error) { case EAGAIN: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_partial(fpl); } break; default: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_smr_exit(fpl); cache_fpl_handled_error(fpl, error); } break; } return (error); } static int cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct mount *mp; int error; ndp = fpl->ndp; cnp = fpl->cnp; cache_fpl_checkpoint(fpl); /* * The vnode at hand is almost always stable, skip checking for it. * Worst case this postpones the check towards the end of the iteration * of the main loop. */ fpl->dvp = dvp; fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp); mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) { return (cache_fpl_aborted(fpl)); } MPASS(fpl->tvp == NULL); for (;;) { cache_fplookup_parse(fpl); error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); if (__predict_false(error != 0)) { error = cache_fplookup_failed_vexec(fpl, error); break; } error = cache_fplookup_next(fpl); if (__predict_false(cache_fpl_terminated(fpl))) { break; } VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); if (fpl->tvp->v_type == VLNK) { error = cache_fplookup_symlink(fpl); if (cache_fpl_terminated(fpl)) { break; } } else { if (cache_fpl_islastcn(ndp)) { error = cache_fplookup_final(fpl); break; } if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { error = cache_fpl_aborted(fpl); break; } fpl->dvp = fpl->tvp; fpl->dvp_seqc = fpl->tvp_seqc; cache_fplookup_parse_advance(fpl); } cache_fpl_checkpoint(fpl); } return (error); } /* * Fast path lookup protected with SMR and sequence counters. * * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. * * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria * outlined below. * * Traditional vnode lookup conceptually looks like this: * * vn_lock(current); * for (;;) { * next = find(); * vn_lock(next); * vn_unlock(current); * current = next; * if (last) * break; * } * return (current); * * Each jump to the next vnode is safe memory-wise and atomic with respect to * any modifications thanks to holding respective locks. * * The same guarantee can be provided with a combination of safe memory * reclamation and sequence counters instead. If all operations which affect * the relationship between the current vnode and the one we are looking for * also modify the counter, we can verify whether all the conditions held as * we made the jump. This includes things like permissions, mount points etc. * Counter modification is provided by enclosing relevant places in * vn_seqc_write_begin()/end() calls. * * Thus this translates to: * * vfs_smr_enter(); * dvp_seqc = seqc_read_any(dvp); * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode * abort(); * for (;;) { * tvp = find(); * tvp_seqc = seqc_read_any(tvp); * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode * abort(); * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode * abort(); * dvp = tvp; // we know nothing of importance has changed * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration * if (last) * break; * } * vget(); // secure the vnode * if (!seqc_consistent(tvp, tvp_seqc) // final check * abort(); * // at this point we know nothing has changed for any parent<->child pair * // as they were crossed during the lookup, meaning we matched the guarantee * // of the locked variant * return (tvp); * * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: * - they are called while within vfs_smr protection which they must never exit * - EAGAIN can be returned to denote checking could not be performed, it is * always valid to return it * - if the sequence counter has not changed the result must be valid * - if the sequence counter has changed both false positives and false negatives * are permitted (since the result will be rejected later) * - for simple cases of unix permission checks vaccess_vexec_smr can be used * * Caveats to watch out for: * - vnodes are passed unlocked and unreferenced with nothing stopping * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised * to use atomic_load_ptr to fetch it. * - the aforementioned object can also get freed, meaning absent other means it * should be protected with vfs_smr * - either safely checking permissions as they are modified or guaranteeing * their stability is left to the routine */ int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, struct pwd **pwdp) { struct cache_fpl fpl; struct pwd *pwd; struct vnode *dvp; struct componentname *cnp; int error; fpl.status = CACHE_FPL_STATUS_UNSET; fpl.in_smr = false; fpl.ndp = ndp; fpl.cnp = cnp = &ndp->ni_cnd; MPASS(ndp->ni_lcf == 0); KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, ("%s: internal flags found in cn_flags %" PRIx64, __func__, cnp->cn_flags)); if ((cnp->cn_flags & SAVESTART) != 0) { MPASS(cnp->cn_nameiop != LOOKUP); } MPASS(cnp->cn_nameptr == cnp->cn_pnbuf); if (__predict_false(!cache_can_fplookup(&fpl))) { *status = fpl.status; SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); return (EOPNOTSUPP); } cache_fpl_checkpoint_outer(&fpl); cache_fpl_smr_enter_initial(&fpl); #ifdef INVARIANTS fpl.debug.ni_pathlen = ndp->ni_pathlen; #endif fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; fpl.fsearch = false; - fpl.savename = (cnp->cn_flags & SAVENAME) != 0; fpl.tvp = NULL; /* for degenerate path handling */ fpl.pwd = pwdp; pwd = pwd_get_smr(); *(fpl.pwd) = pwd; ndp->ni_rootdir = pwd->pwd_rdir; ndp->ni_topdir = pwd->pwd_jdir; if (cnp->cn_pnbuf[0] == '/') { dvp = cache_fpl_handle_root(&fpl); MPASS(ndp->ni_resflags == 0); ndp->ni_resflags = NIRES_ABS; } else { if (ndp->ni_dirfd == AT_FDCWD) { dvp = pwd->pwd_cdir; } else { error = cache_fplookup_dirfd(&fpl, &dvp); if (__predict_false(error != 0)) { goto out; } } } SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); error = cache_fplookup_impl(dvp, &fpl); out: cache_fpl_smr_assert_not_entered(&fpl); cache_fpl_assert_status(&fpl); *status = fpl.status; if (SDT_PROBES_ENABLED()) { SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); if (fpl.status == CACHE_FPL_STATUS_HANDLED) SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, ndp); } if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { MPASS(error != CACHE_FPL_FAILED); if (error != 0) { + cache_fpl_cleanup_cnp(fpl.cnp); MPASS(fpl.dvp == NULL); MPASS(fpl.tvp == NULL); - MPASS(fpl.savename == false); } ndp->ni_dvp = fpl.dvp; ndp->ni_vp = fpl.tvp; - if (fpl.savename) { - cnp->cn_flags |= HASBUF; - } else { - cache_fpl_cleanup_cnp(cnp); - } } return (error); } diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index 79c7fd8365fa..7fac64ff1b38 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -1,1810 +1,1705 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef INVARIANTS #include #endif #include #include #include #define NAMEI_DIAGNOSTIC 1 #undef NAMEI_DIAGNOSTIC #ifdef INVARIANTS -static void NDVALIDATE(struct nameidata *); -#else -#define NDVALIDATE(ndp) do { } while (0) +static void NDVALIDATE_impl(struct nameidata *, int); #endif +#define NDVALIDATE(ndp) NDVALIDATE_impl(ndp, __LINE__) + + SDT_PROVIDER_DEFINE(vfs); SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *", "unsigned long", "bool"); SDT_PROBE_DEFINE4(vfs, namei, lookup, return, "int", "struct vnode *", "bool", "struct nameidata"); /* Allocation zone for namei. */ uma_zone_t namei_zone; /* Placeholder vnode for mp traversal. */ static struct vnode *vp_crossmp; static int crossmp_vop_islocked(struct vop_islocked_args *ap) { return (LK_SHARED); } static int crossmp_vop_lock1(struct vop_lock1_args *ap) { struct vnode *vp; struct lock *lk __diagused; const char *file __witness_used; int flags, line __witness_used; vp = ap->a_vp; lk = vp->v_vnlock; flags = ap->a_flags; file = ap->a_file; line = ap->a_line; if ((flags & LK_SHARED) == 0) panic("invalid lock request for crossmp"); WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? &VI_MTX(vp)->lock_object : NULL); WITNESS_LOCK(&lk->lock_object, 0, file, line); if ((flags & LK_INTERLOCK) != 0) VI_UNLOCK(vp); LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, ap->a_line); return (0); } static int crossmp_vop_unlock(struct vop_unlock_args *ap) { struct vnode *vp; struct lock *lk __diagused; vp = ap->a_vp; lk = vp->v_vnlock; WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE); LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE, LOCK_LINE); return (0); } static struct vop_vector crossmp_vnodeops = { .vop_default = &default_vnodeops, .vop_islocked = crossmp_vop_islocked, .vop_lock1 = crossmp_vop_lock1, .vop_unlock = crossmp_vop_unlock, }; /* * VFS_VOP_VECTOR_REGISTER(crossmp_vnodeops) is not used here since the vnode * gets allocated early. See nameiinit for the direct call below. */ struct nameicap_tracker { struct vnode *dp; TAILQ_ENTRY(nameicap_tracker) nm_link; }; /* Zone for cap mode tracker elements used for dotdot capability checks. */ MALLOC_DEFINE(M_NAMEITRACKER, "namei_tracker", "namei tracking for dotdot"); static void nameiinit(void *dummy __unused) { namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vfs_vector_op_register(&crossmp_vnodeops); getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL); static int lookup_cap_dotdot = 1; SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN, &lookup_cap_dotdot, 0, "enables \"..\" components in path lookup in capability mode"); static int lookup_cap_dotdot_nonlocal = 1; SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN, &lookup_cap_dotdot_nonlocal, 0, "enables \"..\" components in path lookup in capability mode " "on non-local mount"); static void nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp) { struct nameicap_tracker *nt; if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR) return; nt = TAILQ_LAST(&ndp->ni_cap_tracker, nameicap_tracker_head); if (nt != NULL && nt->dp == dp) return; nt = malloc(sizeof(*nt), M_NAMEITRACKER, M_WAITOK); vhold(dp); nt->dp = dp; TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link); } static void nameicap_cleanup_from(struct nameidata *ndp, struct nameicap_tracker *first) { struct nameicap_tracker *nt, *nt1; nt = first; TAILQ_FOREACH_FROM_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) { TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link); vdrop(nt->dp); free(nt, M_NAMEITRACKER); } } static void nameicap_cleanup(struct nameidata *ndp) { KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) || (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative")); nameicap_cleanup_from(ndp, NULL); } /* * For dotdot lookups in capability mode, only allow the component * lookup to succeed if the resulting directory was already traversed * during the operation. This catches situations where already * traversed directory is moved to different parent, and then we walk * over it with dotdots. * * Also allow to force failure of dotdot lookups for non-local * filesystems, where external agents might assist local lookups to * escape the compartment. */ static int nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) { struct nameicap_tracker *nt; struct mount *mp; if (dp == NULL || dp->v_type != VDIR || (ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0) return (0); if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0) return (ENOTCAPABLE); mp = dp->v_mount; if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL && (mp->mnt_flag & MNT_LOCAL) == 0) return (ENOTCAPABLE); TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head, nm_link) { if (dp == nt->dp) { nt = TAILQ_NEXT(nt, nm_link); if (nt != NULL) nameicap_cleanup_from(ndp, nt); return (0); } } return (ENOTCAPABLE); } static void namei_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); -#ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; -#endif } static int namei_handle_root(struct nameidata *ndp, struct vnode **dpp) { struct componentname *cnp; cnp = &ndp->ni_cnd; if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif return (ENOTCAPABLE); } while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } *dpp = ndp->ni_rootdir; vrefact(*dpp); return (0); } static int namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp) { struct componentname *cnp; struct thread *td; struct pwd *pwd; int error; bool startdir_used; cnp = &ndp->ni_cnd; td = curthread; startdir_used = false; *pwdp = NULL; *dpp = NULL; #ifdef CAPABILITY_MODE /* * In capability mode, lookups must be restricted to happen in * the subtree with the root specified by the file descriptor: * - The root must be real file descriptor, not the pseudo-descriptor * AT_FDCWD. * - The passed path must be relative and not absolute. * - If lookup_cap_dotdot is disabled, path must not contain the * '..' components. * - If lookup_cap_dotdot is enabled, we verify that all '..' * components lookups result in the directories which were * previously walked by us, which prevents an escape from * the relative root. */ if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) { ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; ndp->ni_resflags |= NIRES_STRICTREL; if (ndp->ni_dirfd == AT_FDCWD) { #ifdef KTRACE if (KTRPOINT(td, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif return (ECAPMODE); } } #endif error = 0; /* * Get starting point for the translation. */ pwd = pwd_hold(td); /* * The reference on ni_rootdir is acquired in the block below to avoid * back-to-back atomics for absolute lookups. */ ndp->ni_rootdir = pwd->pwd_rdir; ndp->ni_topdir = pwd->pwd_jdir; if (cnp->cn_pnbuf[0] == '/') { ndp->ni_resflags |= NIRES_ABS; error = namei_handle_root(ndp, dpp); } else { if (ndp->ni_startdir != NULL) { *dpp = ndp->ni_startdir; startdir_used = true; } else if (ndp->ni_dirfd == AT_FDCWD) { *dpp = pwd->pwd_cdir; vrefact(*dpp); } else { if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_ATFD1(ndp->ni_dirfd); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_ATFD2(ndp->ni_dirfd); error = fgetvp_lookup(ndp->ni_dirfd, ndp, dpp); } if (error == 0 && (*dpp)->v_type != VDIR && (cnp->cn_pnbuf[0] != '\0' || (cnp->cn_flags & EMPTYPATH) == 0)) error = ENOTDIR; } if (error == 0 && (cnp->cn_flags & RBENEATH) != 0) { if (cnp->cn_pnbuf[0] == '/') { error = ENOTCAPABLE; } else if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0) { ndp->ni_lcf |= NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT; } } /* * If we are auditing the kernel pathname, save the user pathname. */ if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf); if (ndp->ni_startdir != NULL && !startdir_used) vrele(ndp->ni_startdir); if (error != 0) { if (*dpp != NULL) vrele(*dpp); pwd_drop(pwd); return (error); } if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 && lookup_cap_dotdot != 0) ndp->ni_lcf |= NI_LCF_CAP_DOTDOT; SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf, cnp->cn_flags, false); *pwdp = pwd; return (0); } static int namei_getpath(struct nameidata *ndp) { struct componentname *cnp; int error; cnp = &ndp->ni_cnd; /* * Get a buffer for the name to be translated, and copy the * name into the buffer. */ cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); if (ndp->ni_segflg == UIO_SYSSPACE) { error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, &ndp->ni_pathlen); } else { error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, &ndp->ni_pathlen); } return (error); } static int namei_emptypath(struct nameidata *ndp) { struct componentname *cnp; struct pwd *pwd; struct vnode *dp; int error; cnp = &ndp->ni_cnd; MPASS(*cnp->cn_pnbuf == '\0'); MPASS((cnp->cn_flags & EMPTYPATH) != 0); MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); ndp->ni_resflags |= NIRES_EMPTYPATH; error = namei_setup(ndp, &dp, &pwd); if (error != 0) { - namei_cleanup_cnp(cnp); goto errout; } /* * Usecount on dp already provided by namei_setup. */ ndp->ni_vp = dp; - namei_cleanup_cnp(cnp); pwd_drop(pwd); NDVALIDATE(ndp); if ((cnp->cn_flags & LOCKLEAF) != 0) { VOP_LOCK(dp, (cnp->cn_flags & LOCKSHARED) != 0 ? LK_SHARED : LK_EXCLUSIVE); if (VN_IS_DOOMED(dp)) { vput(dp); error = ENOENT; goto errout; } } SDT_PROBE4(vfs, namei, lookup, return, 0, ndp->ni_vp, false, ndp); return (0); errout: SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp); + namei_cleanup_cnp(cnp); return (error); } static int __noinline namei_follow_link(struct nameidata *ndp) { char *cp; struct iovec aiov; struct uio auio; struct componentname *cnp; struct thread *td; int error, linklen; error = 0; cnp = &ndp->ni_cnd; td = curthread; if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto out; } #ifdef MAC if ((cnp->cn_flags & NOMACCHECK) == 0) { error = mac_vnode_check_readlink(td->td_ucred, ndp->ni_vp); if (error != 0) goto out; } #endif if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error != 0) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); goto out; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENOENT; goto out; } if (linklen + ndp->ni_pathlen > MAXPATHLEN) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENAMETOOLONG; goto out; } if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; out: return (error); } /* * Convert a pathname into a pointer to a locked vnode. * * The FOLLOW flag is set when symbolic links are to be followed * when they occur at the end of the name translation process. * Symbolic links are always followed for all other pathname * components other than the last. * * The segflg defines whether the name is to be copied from user * space or kernel space. * * Overall outline of namei: * * copy in name * get starting directory * while (!done && !error) { * call lookup to search path. * if symbolic link, massage name in buffer and continue * } */ int namei(struct nameidata *ndp) { struct vnode *dp; /* the directory we are searching */ struct componentname *cnp; struct thread *td; struct pwd *pwd; int error; enum cache_fpl_status status; cnp = &ndp->ni_cnd; td = curthread; #ifdef INVARIANTS KASSERT((ndp->ni_debugflags & NAMEI_DBG_CALLED) == 0, ("%s: repeated call to namei without NDREINIT", __func__)); KASSERT(ndp->ni_debugflags == NAMEI_DBG_INITED, ("%s: bad debugflags %d", __func__, ndp->ni_debugflags)); ndp->ni_debugflags |= NAMEI_DBG_CALLED; if (ndp->ni_startdir != NULL) ndp->ni_debugflags |= NAMEI_DBG_HADSTARTDIR; if (cnp->cn_flags & FAILIFEXISTS) { KASSERT(cnp->cn_nameiop == CREATE, ("%s: FAILIFEXISTS passed for op %d", __func__, cnp->cn_nameiop)); /* * The limitation below is to restrict hairy corner cases. */ KASSERT((cnp->cn_flags & (LOCKPARENT | LOCKLEAF)) == LOCKPARENT, ("%s: FAILIFEXISTS must be passed with LOCKPARENT and without LOCKLEAF", __func__)); } - /* - * For NDVALIDATE. - * - * While NDINIT may seem like a more natural place to do it, there are - * callers which directly modify flags past invoking init. - */ - cnp->cn_origflags = cnp->cn_flags; #endif ndp->ni_cnd.cn_cred = td->td_ucred; KASSERT(ndp->ni_resflags == 0, ("%s: garbage in ni_resflags: %x\n", __func__, ndp->ni_resflags)); KASSERT(cnp->cn_cred && td->td_proc, ("namei: bad cred/proc")); KASSERT((cnp->cn_flags & NAMEI_INTERNAL_FLAGS) == 0, ("namei: unexpected flags: %" PRIx64 "\n", cnp->cn_flags & NAMEI_INTERNAL_FLAGS)); if (cnp->cn_flags & NOCACHE) KASSERT(cnp->cn_nameiop != LOOKUP, ("%s: NOCACHE passed with LOOKUP", __func__)); MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR || ndp->ni_startdir->v_type == VBAD); ndp->ni_lcf = 0; ndp->ni_loopcnt = 0; ndp->ni_vp = NULL; error = namei_getpath(ndp); if (__predict_false(error != 0)) { namei_cleanup_cnp(cnp); SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp); return (error); } cnp->cn_nameptr = cnp->cn_pnbuf; #ifdef KTRACE if (KTRPOINT(td, KTR_NAMEI)) { ktrnamei(cnp->cn_pnbuf); } #endif TSNAMEI(curthread->td_proc->p_pid, cnp->cn_pnbuf); /* * First try looking up the target without locking any vnodes. * * We may need to start from scratch or pick up where it left off. */ error = cache_fplookup(ndp, &status, &pwd); switch (status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_HANDLED: if (error == 0) NDVALIDATE(ndp); return (error); case CACHE_FPL_STATUS_PARTIAL: TAILQ_INIT(&ndp->ni_cap_tracker); dp = ndp->ni_startdir; break; case CACHE_FPL_STATUS_DESTROYED: ndp->ni_loopcnt = 0; error = namei_getpath(ndp); if (__predict_false(error != 0)) { namei_cleanup_cnp(cnp); return (error); } cnp->cn_nameptr = cnp->cn_pnbuf; /* FALLTHROUGH */ case CACHE_FPL_STATUS_ABORTED: TAILQ_INIT(&ndp->ni_cap_tracker); MPASS(ndp->ni_lcf == 0); if (*cnp->cn_pnbuf == '\0') { if ((cnp->cn_flags & EMPTYPATH) != 0) { return (namei_emptypath(ndp)); } namei_cleanup_cnp(cnp); SDT_PROBE4(vfs, namei, lookup, return, ENOENT, NULL, false, ndp); return (ENOENT); } error = namei_setup(ndp, &dp, &pwd); if (error != 0) { namei_cleanup_cnp(cnp); return (error); } break; } /* * Locked lookup. */ for (;;) { ndp->ni_startdir = dp; error = vfs_lookup(ndp); if (error != 0) goto out; /* * If not a symbolic link, we're done. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, false, ndp); - if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) { - namei_cleanup_cnp(cnp); - } else - cnp->cn_flags |= HASBUF; nameicap_cleanup(ndp); pwd_drop(pwd); NDVALIDATE(ndp); return (0); } error = namei_follow_link(ndp); if (error != 0) break; vput(ndp->ni_vp); dp = ndp->ni_dvp; /* * Check if root directory should replace current directory. */ cnp->cn_nameptr = cnp->cn_pnbuf; if (*(cnp->cn_nameptr) == '/') { vrele(dp); error = namei_handle_root(ndp, &dp); if (error != 0) goto out; } } vput(ndp->ni_vp); ndp->ni_vp = NULL; vrele(ndp->ni_dvp); out: MPASS(error != 0); SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp); namei_cleanup_cnp(cnp); nameicap_cleanup(ndp); pwd_drop(pwd); return (error); } static int compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags) { if (mp == NULL || ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) { lkflags &= ~LK_SHARED; lkflags |= LK_EXCLUSIVE; } lkflags |= LK_NODDLKTREAT; return (lkflags); } static __inline int needs_exclusive_leaf(struct mount *mp, int flags) { /* * Intermediate nodes can use shared locks, we only need to * force an exclusive lock for leaf nodes. */ if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF)) return (0); /* Always use exclusive locks if LOCKSHARED isn't set. */ if (!(flags & LOCKSHARED)) return (1); /* * For lookups during open(), if the mount point supports * extended shared operations, then use a shared lock for the * leaf node, otherwise use an exclusive lock. */ if ((flags & ISOPEN) != 0) return (!MNT_EXTENDED_SHARED(mp)); /* * Lookup requests outside of open() that specify LOCKSHARED * only need a shared lock on the leaf vnode. */ return (0); } /* * Various filesystems expect to be able to copy a name component with length * bounded by NAME_MAX into a directory entry buffer of size MAXNAMLEN. Make * sure that these are the same size. */ _Static_assert(MAXNAMLEN == NAME_MAX, "MAXNAMLEN and NAME_MAX have different values"); static int __noinline vfs_lookup_degenerate(struct nameidata *ndp, struct vnode *dp, int wantparent) { struct componentname *cnp; struct mount *mp; int error; cnp = &ndp->ni_cnd; cnp->cn_flags |= ISLASTCN; mp = atomic_load_ptr(&dp->v_mount); if (needs_exclusive_leaf(mp, cnp->cn_flags)) { cnp->cn_lkflags &= ~LK_SHARED; cnp->cn_lkflags |= LK_EXCLUSIVE; } vn_lock(dp, compute_cn_lkflags(mp, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (cnp->cn_nameiop != LOOKUP) { error = EISDIR; goto bad; } if (wantparent) { ndp->ni_dvp = dp; VREF(dp); } ndp->ni_vp = dp; cnp->cn_namelen = 0; if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_VNODE1(dp); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_VNODE2(dp); if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) VOP_UNLOCK(dp); /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); bad: VOP_UNLOCK(dp); return (error); } /* * FAILIFEXISTS handling. * * XXX namei called with LOCKPARENT but not LOCKLEAF has the strange * behaviour of leaving the vnode unlocked if the target is the same * vnode as the parent. */ static int __noinline vfs_lookup_failifexists(struct nameidata *ndp) { struct componentname *cnp __diagused; cnp = &ndp->ni_cnd; MPASS((cnp->cn_flags & ISSYMLINK) == 0); if (ndp->ni_vp == ndp->ni_dvp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vrele(ndp->ni_vp); ndp->ni_dvp = NULL; ndp->ni_vp = NULL; NDFREE_PNBUF(ndp); return (EEXIST); } /* * Search a pathname. * This is a very central and rather complicated routine. * * The pathname is pointed to by ni_ptr and is of length ni_pathlen. * The starting directory is taken from ni_startdir. The pathname is * descended until done, or a symbolic link is encountered. The variable * ni_more is clear if the path is completed; it is set to one if a * symbolic link needing interpretation is encountered. * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it, the parent directory is returned * locked. If flag has WANTPARENT or'ed into it, the parent directory is * returned unlocked. Otherwise the parent directory is not returned. If * the target of the pathname exists and LOCKLEAF is or'ed into the flag * the target is returned locked, otherwise it is returned unlocked. * When creating or renaming and LOCKPARENT is specified, the target may not * be ".". When deleting and LOCKPARENT is specified, the target may be ".". * * Overall outline of lookup: * * dirloop: * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent * call VOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, * find mounted on vnode * if more components of name, do next level at dirloop * return the answer in ni_vp, locked if LOCKLEAF set * if LOCKPARENT set, return locked parent in ni_dvp * if WANTPARENT set, return unlocked parent in ni_dvp */ int vfs_lookup(struct nameidata *ndp) { char *cp; /* pointer into pathname argument */ char *prev_ni_next; /* saved ndp->ni_next */ char *nulchar; /* location of '\0' in cn_pnbuf */ char *lastchar; /* location of the last character */ struct vnode *dp = NULL; /* the directory we are searching */ struct vnode *tdp; /* saved dp */ struct mount *mp; /* mount table entry */ struct prison *pr; size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */ int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; int dpunlocked = 0; /* dp has already been unlocked */ int relookup = 0; /* do not consume the path component */ struct componentname *cnp = &ndp->ni_cnd; int lkflags_save; int ni_dvp_unlocked; /* * Setup: break out flag bits into variables. */ ni_dvp_unlocked = 0; wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); KASSERT(cnp->cn_nameiop == LOOKUP || wantparent, ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT.")); /* * When set to zero, docache causes the last component of the * pathname to be deleted from the cache and the full lookup * of the name to be done (via VOP_CACHEDLOOKUP()). Often * filesystems need some pre-computed values that are made * during the full lookup, for instance UFS sets dp->i_offset. * * The docache variable is set to zero when requested by the * NOCACHE flag and for all modifying operations except CREATE. */ docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || (wantparent && cnp->cn_nameiop != CREATE && cnp->cn_nameiop != LOOKUP)) docache = 0; rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; ndp->ni_dvp = NULL; cnp->cn_lkflags = LK_SHARED; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; /* * Leading slashes, if any, are supposed to be skipped by the caller. */ MPASS(cnp->cn_nameptr[0] != '/'); /* * Check for degenerate name (e.g. / or "") which is a way of talking * about a directory, e.g. like "/." or ".". */ if (__predict_false(cnp->cn_nameptr[0] == '\0')) { error = vfs_lookup_degenerate(ndp, dp, wantparent); if (error == 0) goto success_right_lock; goto bad_unlocked; } /* * Nul-out trailing slashes (e.g., "foo///" -> "foo"). * * This must be done before VOP_LOOKUP() because some fs's don't know * about trailing slashes. Remember if there were trailing slashes to * handle symlinks, existing non-directories and non-existing files * that won't be directories specially later. */ MPASS(ndp->ni_pathlen >= 2); lastchar = &cnp->cn_nameptr[ndp->ni_pathlen - 2]; if (*lastchar == '/') { while (lastchar >= cnp->cn_pnbuf) { *lastchar = '\0'; lastchar--; ndp->ni_pathlen--; if (*lastchar != '/') { break; } } cnp->cn_flags |= TRAILINGSLASH; } /* * We use shared locks until we hit the parent of the last cn then * we adjust based on the requesting flags. */ vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); dirloop: /* * Search a new directory. * * The last component of the filename is left accessible via - * cnp->cn_nameptr for callers that need the name. Callers needing - * the name set the SAVENAME flag. When done, they assume - * responsibility for freeing the pathname buffer. + * cnp->cn_nameptr. It has to be freed with a call to NDFREE*. * * Store / as a temporary sentinel so that we only have one character * to test for. Pathnames tend to be short so this should not be * resulting in cache misses. */ nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; KASSERT(*nulchar == '\0', ("%s: expected nul at %p; string [%s]\n", __func__, nulchar, cnp->cn_pnbuf)); *nulchar = '/'; for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { KASSERT(*cp != '\0', ("%s: encountered unexpected nul; string [%s]\n", __func__, cnp->cn_nameptr)); continue; } *nulchar = '\0'; cnp->cn_namelen = cp - cnp->cn_nameptr; if (__predict_false(cnp->cn_namelen > NAME_MAX)) { error = ENAMETOOLONG; goto bad; } #ifdef NAMEI_DIAGNOSTIC { char c = *cp; *cp = '\0'; printf("{%s}: ", cnp->cn_nameptr); *cp = c; } #endif prev_ni_pathlen = ndp->ni_pathlen; ndp->ni_pathlen -= cnp->cn_namelen; KASSERT(ndp->ni_pathlen <= PATH_MAX, ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); prev_ni_next = ndp->ni_next; ndp->ni_next = cp; /* * Something else should be clearing this. */ cnp->cn_flags &= ~(ISDOTDOT|ISLASTCN); cnp->cn_flags |= MAKEENTRY; if (*cp == '\0' && docache == 0) cnp->cn_flags &= ~MAKEENTRY; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; if (*ndp->ni_next == 0) { cnp->cn_flags |= ISLASTCN; if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))) { error = EINVAL; goto bad; } } nameicap_tracker_add(ndp, dp); /* * Make sure degenerate names don't get here, their handling was * previously found in this spot. */ MPASS(cnp->cn_nameptr[0] != '\0'); /* * Handle "..": five special cases. * 0. If doing a capability lookup and lookup_cap_dotdot is * disabled, return ENOTCAPABLE. * 1. Return an error if this is the last component of * the name and the operation is DELETE or RENAME. * 2. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. * 3. If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other filesystem. * 4. If the vnode is the top directory of * the jail or chroot, don't let them out. * 5. If doing a capability lookup and lookup_cap_dotdot is * enabled, return ENOTCAPABLE if the lookup would escape * from the initial file descriptor directory. Checks are * done by ensuring that namei() already traversed the * result of dotdot lookup. */ if (cnp->cn_flags & ISDOTDOT) { if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT)) == NI_LCF_STRICTRELATIVE) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif error = ENOTCAPABLE; goto bad; } if ((cnp->cn_flags & ISLASTCN) != 0 && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EINVAL; goto bad; } for (;;) { for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dp == pr->pr_root) break; if (dp == ndp->ni_rootdir || dp == ndp->ni_topdir || dp == rootvnode || pr != NULL || ((dp->v_vflag & VV_ROOT) != 0 && (cnp->cn_flags & NOCROSSMOUNT) != 0)) { ndp->ni_dvp = dp; ndp->ni_vp = dp; VREF(dp); goto nextname; } if ((dp->v_vflag & VV_ROOT) == 0) break; if (VN_IS_DOOMED(dp)) { /* forced unmount */ error = ENOENT; goto bad; } tdp = dp; dp = dp->v_mount->mnt_vnodecovered; VREF(dp); vput(tdp); vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, ISDOTDOT)); error = nameicap_check_dotdot(ndp, dp); if (error != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif goto bad; } } } /* * We now have a segment name to search for, and a directory to search. */ unionlookup: #ifdef MAC error = mac_vnode_check_lookup(cnp->cn_cred, dp, cnp); if (__predict_false(error)) goto bad; #endif ndp->ni_dvp = dp; ndp->ni_vp = NULL; ASSERT_VOP_LOCKED(dp, "lookup"); /* * If we have a shared lock we may need to upgrade the lock for the * last operation. */ if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) && dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED) vn_lock(dp, LK_UPGRADE|LK_RETRY); if (VN_IS_DOOMED(dp)) { error = ENOENT; goto bad; } /* * If we're looking up the last component and we need an exclusive * lock, adjust our lkflags. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags)) cnp->cn_lkflags = LK_EXCLUSIVE; #ifdef NAMEI_DIAGNOSTIC vn_printf(dp, "lookup in "); #endif lkflags_save = cnp->cn_lkflags; cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags, cnp->cn_flags); error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp); cnp->cn_lkflags = lkflags_save; if (error != 0) { KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif if ((error == ENOENT) && (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) && (dp->v_mount->mnt_flag & MNT_UNION)) { tdp = dp; dp = dp->v_mount->mnt_vnodecovered; VREF(dp); vput(tdp); vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); nameicap_tracker_add(ndp, dp); goto unionlookup; } if (error == ERELOOKUP) { vref(dp); ndp->ni_vp = dp; error = 0; relookup = 1; goto good; } if (error != EJUSTRETURN) goto bad; /* * At this point, we know we're at the end of the * pathname. If creating / renaming, we can consider * allowing the file or directory to be created / renamed, * provided we're not on a read-only filesystem. */ if (rdonly) { error = EROFS; goto bad; } /* trailing slash only allowed for directories */ if ((cnp->cn_flags & TRAILINGSLASH) && !(cnp->cn_flags & WILLBEDIR)) { error = ENOENT; goto bad; } if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } goto success; } good: #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif dp = ndp->ni_vp; /* * Check to see if the vnode has been mounted on; * if so find the root of the mounted filesystem. */ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && (cnp->cn_flags & NOCROSSMOUNT) == 0) { if (vfs_busy(mp, 0)) continue; vput(dp); if (dp != ndp->ni_dvp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vrefact(vp_crossmp); ndp->ni_dvp = vp_crossmp; error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags, cnp->cn_flags), &tdp); vfs_unbusy(mp); if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) panic("vp_crossmp exclusively locked or reclaimed"); if (error) { dpunlocked = 1; goto bad2; } ndp->ni_vp = dp = tdp; } /* * Check for symbolic link */ if ((dp->v_type == VLNK) && ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) || *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; if (VN_IS_DOOMED(dp)) { /* * We can't know whether the directory was mounted with * NOSYMFOLLOW, so we can't follow safely. */ error = ENOENT; goto bad2; } if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { error = EACCES; goto bad2; } /* * Symlink code always expects an unlocked dvp. */ if (ndp->ni_dvp != ndp->ni_vp) { VOP_UNLOCK(ndp->ni_dvp); ni_dvp_unlocked = 1; } goto success; } nextname: /* * Not a symbolic link that we will follow. Continue with the * next component if there is any; otherwise, we're done. */ KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/', ("lookup: invalid path state.")); if (relookup) { relookup = 0; ndp->ni_pathlen = prev_ni_pathlen; ndp->ni_next = prev_ni_next; if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); goto dirloop; } if (cnp->cn_flags & ISDOTDOT) { error = nameicap_check_dotdot(ndp, ndp->ni_vp); if (error != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif goto bad2; } } if (*ndp->ni_next == '/') { cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); goto dirloop; } /* * If we're processing a path with a trailing slash, * check that the end result is a directory. */ if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) { error = ENOTDIR; goto bad2; } /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad2; } if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } if (!wantparent) { ni_dvp_unlocked = 2; if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) { VOP_UNLOCK(ndp->ni_dvp); ni_dvp_unlocked = 1; } if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_VNODE1(dp); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_VNODE2(dp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp); success: /* * FIXME: for lookups which only cross a mount point to fetch the * root vnode, ni_dvp will be set to vp_crossmp. This can be a problem * if either WANTPARENT or LOCKPARENT is set. */ /* * Because of shared lookup we may have the vnode shared locked, but * the caller may want it to be exclusively locked. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) && VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { vn_lock(dp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(dp)) { error = ENOENT; goto bad2; } } success_right_lock: if (ndp->ni_vp != NULL) { if ((cnp->cn_flags & ISDOTDOT) == 0) nameicap_tracker_add(ndp, ndp->ni_vp); if ((cnp->cn_flags & (FAILIFEXISTS | ISSYMLINK)) == FAILIFEXISTS) return (vfs_lookup_failifexists(ndp)); } return (0); bad2: if (ni_dvp_unlocked != 2) { if (dp != ndp->ni_dvp && !ni_dvp_unlocked) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); } bad: if (!dpunlocked) vput(dp); bad_unlocked: ndp->ni_vp = NULL; return (error); } /* * relookup - lookup a path name component * Used by lookup to re-acquire things. */ int vfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct vnode *dp = NULL; /* the directory we are searching */ int rdonly; /* lookup read-only flag bit */ int error = 0; KASSERT(cnp->cn_flags & ISLASTCN, ("relookup: Not given last component.")); /* * Setup: break out flag bits into variables. */ KASSERT((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) != 0, ("relookup: parent not wanted")); rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; dp = dvp; cnp->cn_lkflags = LK_EXCLUSIVE; vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); /* * Search a new directory. * - * The last component of the filename is left accessible via - * cnp->cn_nameptr for callers that need the name. Callers needing - * the name set the SAVENAME flag. When done, they assume - * responsibility for freeing the pathname buffer. + * See a comment in vfs_lookup for cnp->cn_nameptr. */ #ifdef NAMEI_DIAGNOSTIC printf("{%s}: ", cnp->cn_nameptr); #endif /* * Check for "" which represents the root directory after slash * removal. */ if (cnp->cn_nameptr[0] == '\0') { /* * Support only LOOKUP for "/" because lookup() * can't succeed for CREATE, DELETE and RENAME. */ KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP")); KASSERT(dp->v_type == VDIR, ("dp is not a directory")); if (!(cnp->cn_flags & LOCKLEAF)) VOP_UNLOCK(dp); *vpp = dp; /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } if (cnp->cn_flags & ISDOTDOT) panic ("relookup: lookup on dot-dot"); /* * We now have a segment name to search for, and a directory to search. */ #ifdef NAMEI_DIAGNOSTIC vn_printf(dp, "search in "); #endif if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { KASSERT(*vpp == NULL, ("leaf should be empty")); if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ return (0); } dp = *vpp; /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { if (dvp == dp) vrele(dvp); else vput(dvp); error = EROFS; goto bad; } /* * Set the parent lock/ref state to the requested state. */ if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) VOP_UNLOCK(dvp); /* * Check for symbolic link */ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), ("relookup: symlink found.\n")); /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp); return (0); bad: vput(dp); *vpp = NULL; return (error); } -/* - * Free data allocated by namei(); see namei(9) for details. - */ -void -NDFREE_PNBUF(struct nameidata *ndp) -{ - - if ((ndp->ni_cnd.cn_flags & HASBUF) != 0) { - MPASS((ndp->ni_cnd.cn_flags & (SAVENAME | SAVESTART)) != 0); - uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); - ndp->ni_cnd.cn_flags &= ~HASBUF; - } -} - -/* - * NDFREE_PNBUF replacement for callers that know there is no buffer. - * - * This is a hack. Preferably the VFS layer would not produce anything more - * than it was asked to do. Unfortunately several non-LOOKUP cases can add the - * HASBUF flag to the result. Even then an interface could be implemented where - * the caller specifies what they expected to see in the result and what they - * are going to take care of. - * - * In the meantime provide this kludge as a trivial replacement for NDFREE_PNBUF - * calls scattered throughout the kernel where we know for a fact the flag must not - * be seen. - */ -#ifdef INVARIANTS -void -NDFREE_NOTHING(struct nameidata *ndp) -{ - struct componentname *cnp; - - cnp = &ndp->ni_cnd; - KASSERT(cnp->cn_nameiop == LOOKUP, ("%s: got non-LOOKUP op %d\n", - __func__, cnp->cn_nameiop)); - KASSERT((cnp->cn_flags & (SAVENAME | HASBUF)) == 0, - ("%s: bad flags \%" PRIx64 "\n", __func__, cnp->cn_flags)); -} -#endif - void (NDFREE)(struct nameidata *ndp, const u_int flags) { int unlock_dvp; int unlock_vp; unlock_dvp = 0; unlock_vp = 0; if (!(flags & NDF_NO_FREE_PNBUF)) { NDFREE_PNBUF(ndp); } if (!(flags & NDF_NO_VP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) unlock_vp = 1; if (!(flags & NDF_NO_DVP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKPARENT) && ndp->ni_dvp != ndp->ni_vp) unlock_dvp = 1; if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { if (unlock_vp) { vput(ndp->ni_vp); unlock_vp = 0; } else vrele(ndp->ni_vp); ndp->ni_vp = NULL; } if (unlock_vp) VOP_UNLOCK(ndp->ni_vp); if (!(flags & NDF_NO_DVP_RELE) && (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { if (unlock_dvp) { vput(ndp->ni_dvp); unlock_dvp = 0; } else vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; } if (unlock_dvp) VOP_UNLOCK(ndp->ni_dvp); if (!(flags & NDF_NO_STARTDIR_RELE) && (ndp->ni_cnd.cn_flags & SAVESTART)) { vrele(ndp->ni_startdir); ndp->ni_startdir = NULL; } } #ifdef INVARIANTS /* * Validate the final state of ndp after the lookup. - * - * Historically filesystems were allowed to modify cn_flags. Most notably they - * can add SAVENAME to the request, resulting in HASBUF and pushing subsequent - * clean up to the consumer. In practice this seems to only concern != LOOKUP - * operations. - * - * As a step towards stricter API contract this routine validates the state to - * clean up. Note validation is a work in progress with the intent of becoming - * stricter over time. */ -#define NDMODIFYINGFLAGS (LOCKLEAF | LOCKPARENT | WANTPARENT | SAVENAME | SAVESTART | HASBUF) static void -NDVALIDATE(struct nameidata *ndp) +NDVALIDATE_impl(struct nameidata *ndp, int line) { struct componentname *cnp; - uint64_t used, orig; cnp = &ndp->ni_cnd; - orig = cnp->cn_origflags; - used = cnp->cn_flags; - switch (cnp->cn_nameiop) { - case LOOKUP: - /* - * For plain lookup we require strict conformance -- nothing - * to clean up if it was not requested by the caller. - */ - orig &= NDMODIFYINGFLAGS; - used &= NDMODIFYINGFLAGS; - if ((orig & (SAVENAME | SAVESTART)) != 0) - orig |= HASBUF; - if (orig != used) { - goto out_mismatch; - } - break; - case CREATE: - case DELETE: - case RENAME: - /* - * Some filesystems set SAVENAME to provoke HASBUF, accommodate - * for it until it gets fixed. - */ - orig &= NDMODIFYINGFLAGS; - orig |= (SAVENAME | HASBUF); - used &= NDMODIFYINGFLAGS; - used |= (SAVENAME | HASBUF); - if (orig != used) { - goto out_mismatch; - } - break; - } - return; -out_mismatch: - panic("%s: mismatched flags for op %d: added %" PRIx64 ", " - "removed %" PRIx64" (%" PRIx64" != %" PRIx64"; stored %" PRIx64" != %" PRIx64")", - __func__, cnp->cn_nameiop, used & ~orig, orig &~ used, - orig, used, cnp->cn_origflags, cnp->cn_flags); + if (cnp->cn_pnbuf == NULL) + panic("%s: got no buf! called from %d", __func__, line); } + #endif /* * Determine if there is a suitable alternate filename under the specified * prefix for the specified path. If the create flag is set, then the * alternate prefix will be used so long as the parent directory exists. * This is used by the various compatibility ABIs so that Linux binaries prefer * files under /compat/linux for example. The chosen path (whether under * the prefix or under /) is returned in a kernel malloc'd buffer pointed * to by pathbuf. The caller is responsible for free'ing the buffer from * the M_TEMP bucket if one is returned. */ int kern_alternate_path(const char *prefix, const char *path, enum uio_seg pathseg, char **pathbuf, int create, int dirfd) { struct nameidata nd, ndroot; char *ptr, *buf, *cp; size_t len, sz; int error; buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); *pathbuf = buf; /* Copy the prefix into the new pathname as a starting point. */ len = strlcpy(buf, prefix, MAXPATHLEN); if (len >= MAXPATHLEN) { *pathbuf = NULL; free(buf, M_TEMP); return (EINVAL); } sz = MAXPATHLEN - len; ptr = buf + len; /* Append the filename to the prefix. */ if (pathseg == UIO_SYSSPACE) error = copystr(path, ptr, sz, &len); else error = copyinstr(path, ptr, sz, &len); if (error) { *pathbuf = NULL; free(buf, M_TEMP); return (error); } /* Only use a prefix with absolute pathnames. */ if (*ptr != '/') { error = EINVAL; goto keeporig; } if (dirfd != AT_FDCWD) { /* * We want the original because the "prefix" is * included in the already opened dirfd. */ bcopy(ptr, buf, len); return (0); } /* * We know that there is a / somewhere in this pathname. * Search backwards for it, to find the file's parent dir * to see if it exists in the alternate tree. If it does, * and we want to create a file (cflag is set). We don't * need to worry about the root comparison in this case. */ if (create) { for (cp = &ptr[len] - 1; *cp != '/'; cp--); *cp = '\0'; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf); error = namei(&nd); *cp = '/'; if (error != 0) goto keeporig; } else { NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf); error = namei(&nd); if (error != 0) goto keeporig; /* * We now compare the vnode of the prefix to the one * vnode asked. If they resolve to be the same, then we * ignore the match so that the real root gets used. * This avoids the problem of traversing "../.." to find the * root directory and never finding it, because "/" resolves * to the emulation root directory. This is expensive :-( */ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix); /* We shouldn't ever get an error from this namei(). */ error = namei(&ndroot); if (error == 0) { if (nd.ni_vp == ndroot.ni_vp) error = ENOENT; NDFREE_PNBUF(&ndroot); vrele(ndroot.ni_vp); } } NDFREE_PNBUF(&nd); vrele(nd.ni_vp); keeporig: /* If there was an error, use the original path name. */ if (error) bcopy(ptr, buf, len); return (error); } diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 62430f4aaf39..2386e5a08169 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,7075 +1,7075 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_init_counters(struct vnode *); static void vn_seqc_init(struct vnode *); static void vn_seqc_write_end_free(struct vnode *vp); static void vgonel(struct vnode *); static bool vhold_recycle_free(struct vnode *); static void vdropl_recycle(struct vnode *vp); static void vdrop_recycle(struct vnode *vp); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_lock(void *arg, int what); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); static void vnlru_recalc(void); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. */ static u_long __exclusive_cache_line numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static counter_u64_t vnodes_created; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, "Number of vnodes created by getnewvnode"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of allocates vnodes in the system. */ static TAILQ_HEAD(freelst, vnode) vnode_list; static struct vnode *vnode_list_free_marker; static struct vnode *vnode_list_reclaim_marker; /* * "Free" vnode target. Free vnodes are rarely completely free, but are * just ones that are cheap to recycle. Usually they are for files which * have been stat'd but not read; these usually have inode and namecache * data attached to them. This target is the preferred minimum size of a * sub-cache consisting mostly of such files. The system balances the size * of this sub-cache with its complement to try to prevent either from * thrashing while the other is relatively inactive. The targets express * a preference for the best balance. * * "Above" this target there are 2 further targets (watermarks) related * to recyling of free vnodes. In the best-operating case, the cache is * exactly full, the free list has size between vlowat and vhiwat above the * free target, and recycling from it and normal use maintains this state. * Sometimes the free list is below vlowat or even empty, but this state * is even better for immediate use provided the cache is not full. * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free * ones) to reach one of these states. The watermarks are currently hard- * coded as 4% and 9% of the available space higher. These and the default * of 25% for wantfreevnodes are too large if the memory size is large. * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim * whenever vnlru_proc() becomes active. */ static long wantfreevnodes; static long __exclusive_cache_line freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of \"free\" vnodes"); static long freevnodes_old; static counter_u64_t recycles_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, "Number of vnodes recycled to meet vnode cache targets"); static counter_u64_t recycles_free_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, "Number of free vnodes recycled to meet vnode cache targets"); static counter_u64_t deferred_inact; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, "Number of times inactive processing was deferred"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_list * numvnodes * freevnodes */ static struct mtx __exclusive_cache_line vnode_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; static smr_t buf_trie_smr; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); __read_frequently smr_t vfs_smr; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); #define VDBATCH_SIZE 8 struct vdbatch { u_int index; long freevnodes; struct mtx lock; struct vnode *tab[VDBATCH_SIZE]; }; DPCPU_DEFINE_STATIC(struct vdbatch, vd); static void vdbatch_dequeue(struct vnode *vp); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ u_long desiredvnodes; static u_long gapvnodes; /* gap between wanted and desired */ static u_long vhiwat; /* enough extras after expansion */ static u_long vlowat; /* minimal extras before expansion */ static u_long vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static u_long vnlru_read_freevnodes(void); /* * Note that no attempt is made to sanitize these parameters. */ static int sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = desiredvnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == desiredvnodes) return (0); mtx_lock(&vnode_list_mtx); desiredvnodes = val; wantfreevnodes = desiredvnodes / 4; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); /* * XXX There is no protection against multiple threads changing * desiredvnodes at the same time. Locking above only helps vnlru and * getnewvnode. */ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes"); static int sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = wantfreevnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == wantfreevnodes) return (0); mtx_lock(&vnode_list_mtx); wantfreevnodes = val; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); return (0); } SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen >= PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; - ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME; + ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if (VN_IS_DOOMED(vp)) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } counter_u64_add(recycles_count, 1); vgone(vp); putvnode: NDFREE(&nd, 0); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; counter_u64_add(recycles_count, 1); vgone(vp); VOP_UNLOCK(vp); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ #define vnsz2log 8 #ifndef DEBUG_LOCKS _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && sizeof(struct vnode) < 1UL << (vnsz2log + 1), "vnsz2log needs to be updated"); #endif /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree_smr(buf_trie_zone, node); } PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, buf_trie_smr); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ #endif static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static struct vnode * vn_alloc_marker(struct mount *mp) { struct vnode *vp; vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); vp->v_type = VMARKER; vp->v_mount = mp; return (vp); } static void vn_free_marker(struct vnode *vp) { MPASS(vp->v_type == VMARKER); free(vp, M_VNODE_MARKER); } #ifdef KASAN static int vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) { kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); return (0); } static void vnode_dtor(void *mem, int size, void *arg __unused) { size_t end1, end2, off1, off2; _Static_assert(offsetof(struct vnode, v_vnodelist) < offsetof(struct vnode, v_dbatchcpu), "KASAN marks require updating"); off1 = offsetof(struct vnode, v_vnodelist); off2 = offsetof(struct vnode, v_dbatchcpu); end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); /* * Access to the v_vnodelist and v_dbatchcpu fields are permitted even * after the vnode has been freed. Try to get some KASAN coverage by * marking everything except those two fields as invalid. Because * KASAN's tracking is not byte-granular, any preceding fields sharing * the same 8-byte aligned word must also be marked valid. */ /* Handle the area from the start until v_vnodelist... */ off1 = rounddown2(off1, KASAN_SHADOW_SCALE); kasan_mark(mem, off1, off1, KASAN_UMA_FREED); /* ... then the area between v_vnodelist and v_dbatchcpu ... */ off1 = roundup2(end1, KASAN_SHADOW_SCALE); off2 = rounddown2(off2, KASAN_SHADOW_SCALE); if (off2 > off1) kasan_mark((void *)((char *)mem + off1), off2 - off1, off2 - off1, KASAN_UMA_FREED); /* ... and finally the area from v_dbatchcpu to the end. */ off2 = roundup2(end2, KASAN_SHADOW_SCALE); kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, KASAN_UMA_FREED); } #endif /* KASAN */ /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ cache_vnode_init(vp); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); vp->v_dbatchcpu = NOCPU; /* * Check vhold_recycle_free for an explanation. */ vp->v_holdcnt = VHOLD_NO_SMR; vp->v_type = VNON; mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); kasan_mark(mem, size, size, 0); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { struct vdbatch *vd; uma_ctor ctor; uma_dtor dtor; int cpu, physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. */ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %lu -> %lu\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_list); mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); /* * The lock is taken to appease WITNESS. */ mtx_lock(&vnode_list_mtx); vnlru_recalc(); mtx_unlock(&vnode_list_mtx); vnode_list_free_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); #ifdef KASAN ctor = vnode_ctor; dtor = vnode_dtor; #else ctor = NULL; dtor = NULL; #endif vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); uma_zone_set_smr(vnode_zone, vfs_smr); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_SMR); buf_trie_smr = uma_zone_get_smr(buf_trie_zone); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); recycles_count = counter_u64_alloc(M_WAITOK); recycles_free_count = counter_u64_alloc(M_WAITOK); deferred_inact = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); bzero(vd, sizeof(*vd)); mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { struct mount_pcpu *mpcpu; MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); vfs_mp_count_add_pcpu(mpcpu, ref, 1); vfs_mp_count_add_pcpu(mpcpu, lockref, 1); vfs_op_thread_exit(mp, mpcpu); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); return (0); } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("%s: non-empty upper mount list with pending unmount", __func__)); if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { struct mount_pcpu *mpcpu; int c; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_lockref; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Try to reduce the total number of vnodes. * * This routine (and its user) are buggy in at least the following ways: * - all parameters were picked years ago when RAM sizes were significantly * smaller * - it can pick vnodes based on pages used by the vm object, but filesystems * like ZFS don't use it making the pick broken * - since ZFS has its own aging policy it gets partially combated by this one * - a dedicated method should be provided for filesystems to let them decide * whether the vnode should be recycled * * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. * * @param reclaim_nc_src Only reclaim directories with outgoing namecache * entries if this argument is strue * @param trigger Only reclaim vnodes with fewer than this many resident * pages. * @param target How many vnodes to reclaim. * @return The number of vnodes that were reclaimed. */ static int vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) { struct vnode *vp, *mvp; struct mount *mp; struct vm_object *object; u_long done; bool retried; mtx_assert(&vnode_list_mtx, MA_OWNED); retried = false; done = 0; mvp = vnode_list_reclaim_marker; restart: vp = mvp; while (done < target) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) break; if (__predict_false(vp->v_type == VMARKER)) continue; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. * Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. */ if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) goto next_iter; if (vp->v_type == VBAD || vp->v_type == VNON) goto next_iter; object = atomic_load_ptr(&vp->v_object); if (object == NULL || object->resident_page_count > trigger) { goto next_iter; } /* * Handle races against vnode allocation. Filesystems lock the * vnode some time after it gets returned from getnewvnode, * despite type and hold count being manipulated earlier. * Resorting to checking v_mount restores guarantees present * before the global list was reworked to contain all vnodes. */ if (!VI_TRYLOCK(vp)) goto next_iter; if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { VI_UNLOCK(vp); goto next_iter; } if (vp->v_mount == NULL) { VI_UNLOCK(vp); goto next_iter; } vholdl(vp); VI_UNLOCK(vp); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop_recycle(vp); goto next_iter_unlocked; } if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { vdrop_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } VI_LOCK(vp); if (vp->v_usecount > 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_object != NULL && vp->v_object->handle == vp && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } counter_u64_add(recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); done++; next_iter_unlocked: if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; next_iter: MPASS(vp->v_type != VMARKER); if (!should_yield()) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; } if (done == 0 && !retried) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); retried = true; goto restart; } return (done); } static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static int vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp) { struct vnode *vp; struct mount *mp; int ocount; mtx_assert(&vnode_list_mtx, MA_OWNED); if (count > max_vnlru_free) count = max_vnlru_free; ocount = count; vp = mvp; for (;;) { if (count == 0) { break; } vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); break; } if (__predict_false(vp->v_type == VMARKER)) continue; if (vp->v_holdcnt > 0) continue; /* * Don't recycle if our vnode is from different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. */ if (mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) { continue; } if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { continue; } if (!vhold_recycle_free(vp)) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); /* * FIXME: ignores the return value, meaning it may be nothing * got recycled but it claims otherwise to the caller. * * Originally the value started being ignored in 2005 with * 114a1006a8204aa156e1f9ad6476cdff89cada7f . * * Respecting the value can run into significant stalls if most * vnodes belong to one file system and it has writes * suspended. In presence of many threads and millions of * vnodes they keep contending on the vnode_list_mtx lock only * to find vnodes they can't recycle. * * The solution would be to pre-check if the vnode is likely to * be recycle-able, but it needs to happen with the * vnode_list_mtx lock held. This runs into a problem where * VOP_GETWRITEMOUNT (currently needed to find out about if * writes are frozen) can take locks which LOR against it. * * Check nullfs for one example (null_getwritemount). */ vtryrecycle(vp); count--; mtx_lock(&vnode_list_mtx); vp = mvp; } return (ocount - count); } static int vnlru_free_locked(int count) { mtx_assert(&vnode_list_mtx, MA_OWNED); return (vnlru_free_impl(count, NULL, vnode_list_free_marker)); } void vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) { MPASS(mnt_op != NULL); MPASS(mvp != NULL); VNPASS(mvp->v_type == VMARKER, mvp); mtx_lock(&vnode_list_mtx); vnlru_free_impl(count, mnt_op, mvp); mtx_unlock(&vnode_list_mtx); } struct vnode * vnlru_alloc_marker(void) { struct vnode *mvp; mvp = vn_alloc_marker(NULL); mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (mvp); } void vnlru_free_marker(struct vnode *mvp) { mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); vn_free_marker(mvp); } static void vnlru_recalc(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; /* * The main freevnodes counter is only updated when threads requeue their vnode * batches. CPUs are conditionally walked to compute a more accurate total. * * Limit how much of a slop are we willing to tolerate. Note: the actual value * at any given moment can still exceed slop, but it should not be by significant * margin in practice. */ #define VNLRU_FREEVNODES_SLOP 128 static __inline void vfs_freevnodes_inc(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes++; critical_exit(); } static __inline void vfs_freevnodes_dec(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes--; critical_exit(); } static u_long vnlru_read_freevnodes(void) { struct vdbatch *vd; long slop; int cpu; mtx_assert(&vnode_list_mtx, MA_OWNED); if (freevnodes > freevnodes_old) slop = freevnodes - freevnodes_old; else slop = freevnodes_old - freevnodes; if (slop < VNLRU_FREEVNODES_SLOP) return (freevnodes >= 0 ? freevnodes : 0); freevnodes_old = freevnodes; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); freevnodes_old += vd->freevnodes; } return (freevnodes_old >= 0 ? freevnodes_old : 0); } static bool vnlru_under(u_long rnumvnodes, u_long limit) { u_long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = vnlru_read_freevnodes(); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static bool vnlru_under_unlocked(u_long rnumvnodes, u_long limit) { long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = atomic_load_long(&freevnodes); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static void vnlru_kick(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); if (vnlruproc_sig == 0) { vnlruproc_sig = 1; wakeup(vnlruproc); } } static void vnlru_proc(void) { u_long rnumvnodes, rfreevnodes, target; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src, want_reread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; want_reread = false; for (;;) { kproc_suspend_check(vnlruproc); mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (want_reread) { force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; want_reread = false; } /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (rnumvnodes > desiredvnodes) { vnlru_free_locked(rnumvnodes - desiredvnodes); rnumvnodes = atomic_load_long(&numvnodes); } /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or inexcessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = 0; } if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } rfreevnodes = vnlru_read_freevnodes(); onumvnodes = rnumvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. * The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (rnumvnodes <= desiredvnodes) usevnodes = rnumvnodes - rfreevnodes; else usevnodes = rnumvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; done = vlrureclaim(reclaim_nc_src, trigger, target); mtx_unlock(&vnode_list_mtx); if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) uma_reclaim(UMA_RECLAIM_DRAIN); if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } want_reread = true; force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else { want_reread = true; kern_yield(PRI_USER); } } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); vdrop_recycle(vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); vdrop_recycle(vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if (!VN_IS_DOOMED(vp)) { counter_u64_add(recycles_free_count, 1); vgonel(vp); } VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); return (0); } /* * Allocate a new vnode. * * The operation never returns an error. Returning an error was disabled * in r145385 (dated 2005) with the following comment: * * XXX Not all VFS_VGET/ffs_vget callers check returns. * * Given the age of this commit (almost 15 years at the time of writing this * comment) restoring the ability to fail requires a significant audit of * all codepaths. * * The routine can try to free a vnode or stall for up to 1 second waiting for * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. */ static u_long vn_alloc_cyclecount; static struct vnode * __noinline vn_alloc_hard(struct mount *mp) { u_long rnumvnodes, rfreevnodes; mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (rnumvnodes + 1 < desiredvnodes) { vn_alloc_cyclecount = 0; goto alloc; } rfreevnodes = vnlru_read_freevnodes(); if (vn_alloc_cyclecount++ >= rfreevnodes) { vn_alloc_cyclecount = 0; vstir = 1; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (vnlru_free_locked(1) > 0) goto alloc; if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* * Wait for space for a new vnode. */ vnlru_kick(); msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) vnlru_free_locked(1); } alloc: rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (vnlru_under(rnumvnodes, vlowat)) vnlru_kick(); mtx_unlock(&vnode_list_mtx); return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * vn_alloc(struct mount *mp) { u_long rnumvnodes; if (__predict_false(vn_alloc_cyclecount != 0)) return (vn_alloc_hard(mp)); rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { atomic_subtract_long(&numvnodes, 1); return (vn_alloc_hard(mp)); } return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void vn_free(struct vnode *vp) { atomic_subtract_long(&numvnodes, 1); uma_zfree_smr(vnode_zone, vp); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); KASSERT(vops->registered, ("%s: not registered vector op %p\n", __func__, vops)); td = curthread; if (td->td_vp_reserved != NULL) { vp = td->td_vp_reserved; td->td_vp_reserved = NULL; } else { vp = vn_alloc(mp); } counter_u64_add(vnodes_created, 1); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. * * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; #ifdef WITNESS if (lo->lo_name != tag) { #endif lo->lo_name = tag; #ifdef WITNESS WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } #endif /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_op = vops; vp->v_irflag = 0; v_init_counters(vp); vn_seqc_init(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } void getnewvnode_reserve(void) { struct thread *td; td = curthread; MPASS(td->td_vp_reserved == NULL); td->td_vp_reserved = vn_alloc(NULL); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; if (td->td_vp_reserved != NULL) { vn_free(td->td_vp_reserved); td->td_vp_reserved = NULL; } } static void __noinline freevnode(struct vnode *vp) { struct bufobj *bo; /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); /* * Paired with vgone. */ vn_seqc_write_end_free(vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, ("Leaked inactivation")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); destroy_vpollinfo(vp->v_pollinfo); VOP_UNLOCK(vp); vp->v_pollinfo = NULL; } vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; vn_free(vp); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); mp = vp->v_mount; MNT_ILOCK(mp); VI_LOCK(vp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); /* * The caller expects the interlock to be still held. */ ASSERT_VI_LOCKED(vp, __func__); } static int insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); } else { KASSERT(!dtr, ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", __func__)); } /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } /* * Insert into list of vnodes for the new mount point, if available. * insmntque() reclaims the vnode on insertion failure, insmntque1() * leaves handling of the vnode to the caller. */ int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, true)); } int insmntque1(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, false)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); do { error = BO_SYNC(bo, MNT_WAIT); } while (error == ERELOOKUP); if (error != 0) return (error); BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); return (EBUSY); } } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). */ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; b_xflags_t flags; flags = bp->b_xflags; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), ("%s: buffer %p has invalid queue state", __func__, bp)); if ((flags & BX_VNDIRTY) != 0) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_NOBUFS) == 0, ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. */ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); } /* * Look up a buf using the buffer tries, without the bufobj lock. This relies * on SMR for safe lookup, and bufs being in a no-free zone to provide type * stability of the result. Like other lockless lookups, the found buf may * already be invalid by the time this function returns. */ struct buf * gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_UNLOCKED(bo); bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); buf_vlist_remove(bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Move the buffer between the clean and dirty lists of its vnode. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; KASSERT((bp->b_flags & B_PAGING) == 0, ("%s: cannot reassign paging buffer %p", __func__, bp)); CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); BO_LOCK(bo); buf_vlist_remove(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VIRF_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * usecount is manipulated using atomics without holding any locks. * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. * * Consumers which don't guarantee liveness of the vnode can use SMR to * try to get a reference. Note this operation can fail since the vnode * may be awaiting getting freed by the time they get to it. */ enum vgetstate vget_prep_smr(struct vnode *vp) { enum vgetstate vs; VFS_SMR_ASSERT_ENTERED(); if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { if (vhold_smr(vp)) vs = VGET_HOLDCNT; else vs = VGET_NONE; } return (vs); } enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { vhold(vp); vs = VGET_HOLDCNT; } return (vs); } void vget_abort(struct vnode *vp, enum vgetstate vs) { switch (vs) { case VGET_USECOUNT: vrele(vp); break; case VGET_HOLDCNT: vdrop(vp); break; default: __assert_unreachable(); } } int vget(struct vnode *vp, int flags) { enum vgetstate vs; vs = vget_prep(vp); return (vget_finish(vp, flags, vs)); } int vget_finish(struct vnode *vp, int flags, enum vgetstate vs) { int error; if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); error = vn_lock(vp, flags); if (__predict_false(error != 0)) { vget_abort(vp, vs); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } vget_finish_ref(vp, vs); return (0); } void vget_finish_ref(struct vnode *vp, enum vgetstate vs) { int old; VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); if (vs == VGET_USECOUNT) return; /* * We hold the vnode. If the usecount is 0 it will be utilized to keep * the vnode around. Otherwise someone else lended their hold count and * we have to drop ours. */ old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); if (old != 0) { #ifdef INVARIANTS old = atomic_fetchadd_int(&vp->v_holdcnt, -1); VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); #else refcount_release(&vp->v_holdcnt); #endif } } void vref(struct vnode *vp) { enum vgetstate vs; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vs = vget_prep(vp); vget_finish_ref(vp, vs); } void vrefact(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); #else refcount_acquire(&vp->v_usecount); #endif } void vlazy(struct vnode *vp) { struct mount *mp; VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); if ((vp->v_mflag & VMP_LAZYLIST) != 0) return; /* * We may get here for inactive routines after the vnode got doomed. */ if (VN_IS_DOOMED(vp)) return; mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_LAZYLIST) == 0) { vp->v_mflag |= VMP_LAZYLIST; TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize++; } mtx_unlock(&mp->mnt_listmtx); } static void vunlazy(struct vnode *vp) { struct mount *mp; ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * Don't remove the vnode from the lazy list if another thread * has increased the hold count. It may have re-enqueued the * vnode to the lazy list and is now responsible for its * removal. */ if (vp->v_holdcnt == 0) { vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; } mtx_unlock(&mp->mnt_listmtx); } /* * This routine is only meant to be called from vgonel prior to dooming * the vnode. */ static void vunlazy_gone(struct vnode *vp) { struct mount *mp; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } } static void vdefer_inactive(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode without hold count", __func__)); if (VN_IS_DOOMED(vp)) { vdropl(vp); return; } if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vdropl(vp); return; } if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; vdropl(vp); return; } vlazy(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); counter_u64_add(deferred_inact, 1); } static void vdefer_inactive_unlocked(struct vnode *vp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } vdefer_inactive(vp); } enum vput_op { VRELE, VPUT, VUNREF }; /* * Handle ->v_usecount transitioning to 0. * * By releasing the last usecount we take ownership of the hold count which * provides liveness of the vnode, meaning we have to vdrop. * * For all vnodes we may need to perform inactive processing. It requires an * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. * * XXX Some filesystems pass in an exclusively locked vnode and strongly depend * on the lock being held all the way until VOP_INACTIVE. This in particular * happens with UFS which adds half-constructed vnodes to the hash, where they * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) { int error; bool want_unlock; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); VI_LOCK(vp); /* * By the time we got here someone else might have transitioned * the count back to > 0. */ if (vp->v_usecount > 0) goto out; /* * If the vnode is doomed vgone already performed inactive processing * (if needed). */ if (VN_IS_DOOMED(vp)) goto out; if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) goto out; if (vp->v_iflag & VI_DOINGINACT) goto out; /* * Locking operations here will drop the interlock and possibly the * vnode lock, opening a window where the vnode can get doomed all the * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to * perform inactive. */ vp->v_iflag |= VI_OWEINACT; want_unlock = false; error = 0; switch (func) { case VRELE: switch (VOP_ISLOCKED(vp)) { case LK_EXCLUSIVE: break; case LK_EXCLOTHER: case 0: want_unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; default: /* * The lock has at least one sharer, but we have no way * to conclude whether this is us. Play it safe and * defer processing. */ error = EAGAIN; break; } break; case VPUT: want_unlock = true; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (error == 0) { if (func == VUNREF) { VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, ("recursive vunref")); vp->v_vflag |= VV_UNREF; } for (;;) { error = vinactive(vp); if (want_unlock) VOP_UNLOCK(vp); if (error != ERELOOKUP || !want_unlock) break; VOP_LOCK(vp, LK_EXCLUSIVE); } if (func == VUNREF) vp->v_vflag &= ~VV_UNREF; vdropl(vp); } else { vdefer_inactive(vp); } return; out: if (func == VPUT) VOP_UNLOCK(vp); vdropl(vp); } /* * Decrement ->v_usecount for a vnode. * * Releasing the last use count requires additional processing, see vput_final * above for details. * * Comment above each variant denotes lock state on entry and exit. */ /* * in: any * out: same as passed in */ void vrele(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VRELE); } /* * in: locked * out: unlocked */ void vput(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) { VOP_UNLOCK(vp); return; } vput_final(vp, VPUT); } /* * in: locked * out: locked */ void vunref(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VUNREF); } void vhold(struct vnode *vp) { int old; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); if (old == 0) vfs_freevnodes_dec(); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Grab a hold count unless the vnode is freed. * * Only use this routine if vfs smr is the only protection you have against * freeing the vnode. * * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag * is not set. After the flag is set the vnode becomes immutable to anyone but * the thread which managed to set the flag. * * It may be tempting to replace the loop with: * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); * if (count & VHOLD_NO_SMR) { * backpedal and error out; * } * * However, while this is more performant, it hinders debugging by eliminating * the previously mentioned invariant. */ bool vhold_smr(struct vnode *vp) { int count; VFS_SMR_ASSERT_ENTERED(); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { if (count == 0) vfs_freevnodes_dec(); return (true); } } } /* * Hold a free vnode for recycling. * * Note: vnode_init references this comment. * * Attempts to recycle only need the global vnode list lock and have no use for * SMR. * * However, vnodes get inserted into the global list before they get fully * initialized and stay there until UMA decides to free the memory. This in * particular means the target can be found before it becomes usable and after * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to * VHOLD_NO_SMR. * * Note: the vnode may gain more references after we transition the count 0->1. */ static bool vhold_recycle_free(struct vnode *vp) { int count; mtx_assert(&vnode_list_mtx, MA_OWNED); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (count > 0) { return (false); } if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { vfs_freevnodes_dec(); return (true); } } } static void __noinline vdbatch_process(struct vdbatch *vd) { struct vnode *vp; int i; mtx_assert(&vd->lock, MA_OWNED); MPASS(curthread->td_pinned > 0); MPASS(vd->index == VDBATCH_SIZE); mtx_lock(&vnode_list_mtx); critical_enter(); freevnodes += vd->freevnodes; for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } mtx_unlock(&vnode_list_mtx); vd->freevnodes = 0; bzero(vd->tab, sizeof(vd->tab)); vd->index = 0; critical_exit(); } static void vdbatch_enqueue(struct vnode *vp) { struct vdbatch *vd; ASSERT_VI_LOCKED(vp, __func__); VNASSERT(!VN_IS_DOOMED(vp), vp, ("%s: deferring requeue of a doomed vnode", __func__)); if (vp->v_dbatchcpu != NOCPU) { VI_UNLOCK(vp); return; } sched_pin(); vd = DPCPU_PTR(vd); mtx_lock(&vd->lock); MPASS(vd->index < VDBATCH_SIZE); MPASS(vd->tab[vd->index] == NULL); /* * A hack: we depend on being pinned so that we know what to put in * ->v_dbatchcpu. */ vp->v_dbatchcpu = curcpu; vd->tab[vd->index] = vp; vd->index++; VI_UNLOCK(vp); if (vd->index == VDBATCH_SIZE) vdbatch_process(vd); mtx_unlock(&vd->lock); sched_unpin(); } /* * This routine must only be called for vnodes which are about to be * deallocated. Supporting dequeue for arbitrary vndoes would require * validating that the locked batch matches. */ static void vdbatch_dequeue(struct vnode *vp) { struct vdbatch *vd; int i; short cpu; VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, ("%s: called for a used vnode\n", __func__)); cpu = vp->v_dbatchcpu; if (cpu == NOCPU) return; vd = DPCPU_ID_PTR(cpu, vd); mtx_lock(&vd->lock); for (i = 0; i < vd->index; i++) { if (vd->tab[i] != vp) continue; vp->v_dbatchcpu = NOCPU; vd->index--; vd->tab[i] = vd->tab[vd->index]; vd->tab[vd->index] = NULL; break; } mtx_unlock(&vd->lock); /* * Either we dequeued the vnode above or the target CPU beat us to it. */ MPASS(vp->v_dbatchcpu == NOCPU); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ static void __noinline vdropl_final(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(VN_IS_DOOMED(vp), vp); /* * Set the VHOLD_NO_SMR flag. * * We may be racing against vhold_smr. If they win we can just pretend * we never got this far, they will vdrop later. */ if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { vfs_freevnodes_inc(); VI_UNLOCK(vp); /* * We lost the aforementioned race. Any subsequent access is * invalid as they might have managed to vdropl on their own. */ return; } /* * Don't bump freevnodes as this one is going away. */ freevnode(vp); } void vdrop(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); vdropl(vp); } static void __always_inline vdropl_impl(struct vnode *vp, bool enqueue) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!refcount_release(&vp->v_holdcnt)) { VI_UNLOCK(vp); return; } VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); if (VN_IS_DOOMED(vp)) { vdropl_final(vp); return; } vfs_freevnodes_inc(); if (vp->v_mflag & VMP_LAZYLIST) { vunlazy(vp); } if (!enqueue) { VI_UNLOCK(vp); return; } /* * Also unlocks the interlock. We can't assert on it as we * released our hold and by now the vnode might have been * freed. */ vdbatch_enqueue(vp); } void vdropl(struct vnode *vp) { vdropl_impl(vp, true); } /* * vdrop a vnode when recycling * * This is a special case routine only to be used when recycling, differs from * regular vdrop by not requeieing the vnode on LRU. * * Consider a case where vtryrecycle continuously fails with all vnodes (due to * e.g., frozen writes on the filesystem), filling the batch and causing it to * be requeued. Then vnlru will end up revisiting the same vnodes. This is a * loop which can last for as long as writes are frozen. */ static void vdropl_recycle(struct vnode *vp) { vdropl_impl(vp, false); } static void vdrop_recycle(struct vnode *vp) { VI_LOCK(vp); vdropl_recycle(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. */ static int vinactivef(struct vnode *vp) { struct vm_object *obj; int error; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. */ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } error = VOP_INACTIVE(vp); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; return (error); } int vinactive(struct vnode *vp) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) return (0); if (vp->v_iflag & VI_DOINGINACT) return (0); if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; return (0); } return (vinactivef(vp)); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } do { error = VOP_FSYNC(vp, MNT_WAIT, td); } while (error == ERELOOKUP); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) { struct mount *mp; struct mount_upper_node *ump; mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) return; if (TAILQ_EMPTY(&mp->mnt_notify)) return; MNT_ILOCK(mp); mp->mnt_upper_pending++; KASSERT(mp->mnt_upper_pending > 0, ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump->mp, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump->mp, vp); break; } MNT_ILOCK(mp); } mp->mnt_upper_pending--; if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && mp->mnt_upper_pending == 0) { mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; struct mount *mp; vm_object_t object; bool active, doinginact, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (VN_IS_DOOMED(vp)) return; /* * Paired with freevnode. */ vn_seqc_write_begin_locked(vp); vunlazy_gone(vp); vn_irflag_set_locked(vp, VIRF_DOOMED); /* * Check to see if the vnode is in use. If so, we have to * call VOP_CLOSE() and VOP_INACTIVE(). * * It could be that VOP_INACTIVE() requested reclamation, in * which case we should avoid recursion, so check * VI_DOINGINACT. This is not precise but good enough. */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; /* * If we need to do inactive VI_OWEINACT will be set. */ if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vp->v_iflag &= ~VI_DEFINACT; vdropl(vp); } else { VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); VI_UNLOCK(vp); } cache_purge_vgone(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (!doinginact) { do { if (oweinact || active) { VI_LOCK(vp); vinactivef(vp); oweinact = (vp->v_iflag & VI_OWEINACT) != 0; VI_UNLOCK(vp); } } while (oweinact); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set later, or in * vm_object_terminate() after the object's page queue is * flushed. */ object = vp->v_bufobj.bo_object; if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Handle the VM part. Tmpfs handles v_object on its own (the * OBJT_VNODE check). Nullfs or other bypassing filesystems * should not touch the object borrowed from the lower vnode * (the handle check). */ if (object != NULL && object->type == OBJT_VNODE && object->handle == vp) vnode_destroy_vobject(vp); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p", vp)); /* * Clear the advisory locks and wake up waiting threads. */ if (vp->v_lockf != NULL) { (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; } /* * Delete from old mount point vnode list. */ if (vp->v_mount == NULL) { VI_LOCK(vp); } else { delmntque(vp); ASSERT_VI_LOCKED(vp, "vgonel 2"); } /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_type = VBAD; } /* * Print out a description of a vnode. */ static const char * const typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, "new hold count flag not added to vn_printf"); void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; u_int holdcnt; short irflag; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s\n", typename[vp->v_type]); holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d seqc users %d", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (holdcnt & VHOLD_NO_SMR) strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); printf(" hold count flags (%s)\n", buf + 1); buf[0] = '\0'; buf[1] = '\0'; irflag = vn_irflag_read(vp); if (irflag & VIRF_DOOMED) strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); if (irflag & VIRF_PGREAD) strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); if (irflag & VIRF_MOUNTPOINT) strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); if (irflag & VIRF_TEXT_REF) strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_VMSIZEVNLOCK) strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); if (vp->v_vflag & VV_READLINK) strlcat(buf, "|VV_READLINK", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); if (vp->v_iflag & VI_DEFINACT) strlcat(buf, "|VI_DEFINACT", sizeof(buf)); if (vp->v_iflag & VI_FOPENING) strlcat(buf, "|VI_FOPENING", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT | VI_FOPENING); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_mflag & VMP_LAZYLIST) strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); flags = vp->v_mflag & ~(VMP_LAZYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); printf("\n"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_clean.bv_cnt, vp->v_bufobj.bo_dirty.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vn_printf(vp, "vnode "); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOMSYNC); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_RECURSE); MNT_KERN_FLAG(MNTK_UPPER_WAITER); MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); MNT_KERN_FLAG(MNTK_FPLOOKUP); MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_NULL_NOCACHE); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_lazyvnodelistsize = %d\n", mp->mnt_lazyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } static void vfs_deferred_inactive(struct vnode *vp, int lkflags) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } if (vn_lock(vp, lkflags) == 0) { VI_LOCK(vp); vinactive(vp); VOP_UNLOCK(vp); vdropl(vp); return; } vdefer_inactive_unlocked(vp); } static int vfs_periodic_inactive_filter(struct vnode *vp, void *arg) { return (vp->v_iflag & VI_DEFINACT); } static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; int lkflags; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; } vp->v_iflag &= ~VI_DEFINACT; vfs_deferred_inactive(vp, lkflags); } } static inline bool vfs_want_msync(struct vnode *vp) { struct vm_object *obj; /* * This test may be performed without any locks held. * We rely on vm_object's type stability. */ if (vp->v_vflag & VV_NOSYNC) return (false); obj = vp->v_object; return (obj != NULL && vm_object_mightbedirty(obj)); } static int vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) { if (vp->v_vflag & VV_NOSYNC) return (false); if (vp->v_iflag & VI_DEFINACT) return (true); return (vfs_want_msync(vp)); } static void __noinline vfs_periodic_msync_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; int lkflags, objflags; bool seen_defer; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) { lkflags |= LK_NOWAIT; objflags = OBJPC_NOSYNC; } else { objflags = OBJPC_SYNC; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { seen_defer = false; if (vp->v_iflag & VI_DEFINACT) { vp->v_iflag &= ~VI_DEFINACT; seen_defer = true; } if (!vfs_want_msync(vp)) { if (seen_defer) vfs_deferred_inactive(vp, lkflags); else VI_UNLOCK(vp); continue; } if (vget(vp, lkflags) == 0) { obj = vp->v_object; if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, objflags); VM_OBJECT_WUNLOCK(obj); } vput(vp); if (seen_defer) vdrop(vp); } else { if (seen_defer) vdefer_inactive_unlocked(vp); } } } void vfs_periodic(struct mount *mp, int flags) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) vfs_periodic_inactive(mp, flags); else vfs_periodic_msync_inactive(mp, flags); } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); free(vi, M_VNODEPOLL); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_lock); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; VFS_VOP_VECTOR_REGISTER(sync_vnodeops); /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque1(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); VOP_UNLOCK(syncvp); save = curthread_pflags_set(TDP_SYNCIO); /* * The filesystem at hand may be idle with free vnodes stored in the * batch. Return them instead of letting them stay there indefinitely. */ vfs_periodic(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)); } /* * Check if vnode represents a disk device */ bool vn_isdisk_error(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: *errp = error; return (error == 0); } bool vn_isdisk(struct vnode *vp) { int error; return (vn_isdisk_error(vp, &error)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) { int error; VFS_SMR_ASSERT_ENTERED(); /* Check the owner. */ if (cred->cr_uid == file_uid) { if (file_mode & S_IXUSR) return (0); goto out_error; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) return (0); goto out_error; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) return (0); out_error: /* * Permission check failed, but it is possible denial will get overwritten * (e.g., when root is traversing through a 700 directory owned by someone * else). * * vaccess() calls priv_check_cred which in turn can descent into MAC * modules overriding this result. It's quite unclear what semantics * are allowed for them to operate, thus for safety we don't call them * from within the SMR section. This also means if any such modules * are present, we have to let the regular lookup decide. */ error = priv_check_cred_vfs_lookup_nomac(cred); switch (error) { case 0: return (0); case EAGAIN: /* * MAC modules present. */ return (EAGAIN); case EPERM: return (EACCES); default: return (error); } } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, and credentials. * Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (KERNEL_PANICKED() || vp == NULL) return; locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and * in vop_rename_post but that's not going to work out since some * filesystems relookup vnodes mid-rename. This is probably a bug. * * For now filesystems are expected to do the relevant calls after they * decide what vnodes to operate on. */ if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } static void vop_fsync_debugprepost(struct vnode *vp, const char *name) { if (vp->v_type == VCHR) ; else if (MNT_EXTENDED_SHARED(vp->v_mount)) ASSERT_VOP_LOCKED(vp, name); else ASSERT_VOP_ELOCKED(vp, name); } void vop_fsync_debugpre(void *a) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fsync_debugpost(void *a, int rc __unused) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpre(void *a) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpost(void *a, int rc __unused) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_strategy_debugpre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_debugpre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_debugpost(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_debugpre(void *ap) { struct vop_unlock_args *a = ap; ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_need_inactive_debugpre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_debugpost(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_pre(void *ap) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_create_post(void *ap, int rc) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_whiteout_pre(void *ap) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_whiteout_post(void *ap, int rc) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); } void vop_deleteextattr_pre(void *ap) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_pre(void *ap) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_begin(vp); vn_seqc_write_begin(tdvp); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_end(vp); vn_seqc_write_end(tdvp); if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); } } void vop_mkdir_pre(void *ap) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); } #ifdef DEBUG_VFS_LOCKS void vop_mkdir_debugpost(void *ap, int rc) { struct vop_mkdir_args *a; a = ap; if (!rc) cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); } #endif void vop_mknod_pre(void *ap) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a; struct vnode *vp; a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); } void vop_remove_pre(void *ap) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_pre(void *ap) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { vp->v_vflag |= VV_UNLINKED; VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_setattr_pre(void *ap) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_setacl_pre(void *ap) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setacl_post(void *ap, int rc __unused) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); } void vop_setextattr_pre(void *ap) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_symlink_pre(void *ap) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_read_pgcache_post(void *ap, int rc) { struct vop_read_pgcache_args *a = ap; if (!rc) VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= kn->kn_sfflags & hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp); } static void vfs_knl_assert_lock(void *arg, int what) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; if (what == LA_LOCKED) ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); else ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && kn->kn_filter != EVFILT_WRITE), ("READ/WRITE filter on a FIFO leaked through")); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } /* * Returns whether the directory is empty or not. * If it is empty, the return value is 0; otherwise * the return value is an error value (which may * be ENOTEMPTY). */ int vfs_emptydir(struct vnode *vp) { struct uio uio; struct iovec iov; struct dirent *dirent, *dp, *endp; int error, eof; error = 0; eof = 0; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory")); dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); iov.iov_base = dirent; iov.iov_len = sizeof(struct dirent); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(struct dirent); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; while (eof == 0 && error == 0) { error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, NULL, NULL); if (error != 0) break; endp = (void *)((uint8_t *)dirent + sizeof(struct dirent) - uio.uio_resid); for (dp = dirent; dp < endp; dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { if (dp->d_type == DT_WHT) continue; if (dp->d_namlen == 0) continue; if (dp->d_type != DT_DIR && dp->d_type != DT_UNKNOWN) { error = ENOTEMPTY; break; } if (dp->d_namlen > 2) { error = ENOTEMPTY; break; } if (dp->d_namlen == 1 && dp->d_name[0] != '.') { error = ENOTEMPTY; break; } if (dp->d_namlen == 2 && dp->d_name[1] != '.') { error = ENOTEMPTY; break; } uio.uio_resid = sizeof(struct dirent); } } free(dirent, M_TEMP); return (error); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * Clear out a doomed vnode (if any) and replace it with a new one as long * as the fs is not being unmounted. Return the root vnode to the caller. */ static int __noinline vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; restart: if (mp->mnt_rootvnode != NULL) { MNT_ILOCK(mp); vp = mp->mnt_rootvnode; if (vp != NULL) { if (!VN_IS_DOOMED(vp)) { vrefact(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, flags); if (error == 0) { *vpp = vp; return (0); } vrele(vp); goto restart; } /* * Clear the old one. */ mp->mnt_rootvnode = NULL; } MNT_IUNLOCK(mp); if (vp != NULL) { vfs_op_barrier_wait(mp); vrele(vp); } } error = VFS_CACHEDROOT(mp, flags, vpp); if (error != 0) return (error); if (mp->mnt_vfs_ops == 0) { MNT_ILOCK(mp); if (mp->mnt_vfs_ops != 0) { MNT_IUNLOCK(mp); return (0); } if (mp->mnt_rootvnode == NULL) { vrefact(*vpp); mp->mnt_rootvnode = *vpp; } else { if (mp->mnt_rootvnode != *vpp) { if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { panic("%s: mismatch between vnode returned " " by VFS_CACHEDROOT and the one cached " " (%p != %p)", __func__, *vpp, mp->mnt_rootvnode); } } } MNT_IUNLOCK(mp); } return (0); } int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) { struct mount_pcpu *mpcpu; struct vnode *vp; int error; if (!vfs_op_thread_enter(mp, mpcpu)) return (vfs_cache_root_fallback(mp, flags, vpp)); vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit(mp, mpcpu); return (vfs_cache_root_fallback(mp, flags, vpp)); } vrefact(vp); vfs_op_thread_exit(mp, mpcpu); error = vn_lock(vp, flags); if (error != 0) { vrele(vp); return (vfs_cache_root_fallback(mp, flags, vpp)); } *vpp = vp; return (0); } struct vnode * vfs_cache_root_clear(struct mount *mp) { struct vnode *vp; /* * ops > 0 guarantees there is nobody who can see this vnode */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; if (vp != NULL) vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } void vfs_cache_root_set(struct mount *mp, struct vnode *vp) { MPASS(mp->mnt_vfs_ops > 0); vrefact(vp); mp->mnt_rootvnode = vp; } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h */ static void mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_lazy iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); /* * Note we may be racing against vdrop which transitioned the hold * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, * if we are the only user after we get the interlock we will just * vdrop. */ vhold(vp); mtx_unlock(&mp->mnt_listmtx); VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); goto out_lost; } VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * There is nothing to do if we are the last user. */ if (!refcount_release_if_not_last(&vp->v_holdcnt)) goto out_lost; mtx_lock(&mp->mnt_listmtx); return (true); out_lost: vdropl(vp); maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (false); } static struct vnode * mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_lazylist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } /* * See if we want to process the vnode. Note we may encounter a * long string of vnodes we don't care about and hog the list * as a result. Check for it and requeue the marker. */ VNPASS(!VN_IS_DOOMED(vp), vp); if (!cb(vp, cbarg)) { if (!should_yield()) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); goto restart; } /* * Try-lock because this is the wrong lock order. */ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the lazy list %p %p", vp, mp)); VNPASS(vp->v_mount == mp, vp); VNPASS(!VN_IS_DOOMED(vp), vp); break; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "lazy iter"); return (vp); } struct vnode * __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } struct vnode * __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) return (NULL); *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); } int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; return (0); } return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); } /* * Do not use this variant unless you have means other than the hold count * to prevent the vnode from getting freed. */ void vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1) seqc_sleepable_write_begin(&vp->v_seqc); } void vn_seqc_write_begin(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_end_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users > 0, vp); vp->v_seqc_users--; if (vp->v_seqc_users == 0) seqc_sleepable_write_end(&vp->v_seqc); } void vn_seqc_write_end(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_end_locked(vp); VI_UNLOCK(vp); } /* * Special case handling for allocating and freeing vnodes. * * The counter remains unchanged on free so that a doomed vnode will * keep testing as in modify as long as it is accessible with SMR. */ static void vn_seqc_init(struct vnode *vp) { vp->v_seqc = 0; vp->v_seqc_users = 0; } static void vn_seqc_write_end_free(struct vnode *vp) { VNPASS(seqc_in_modify(vp->v_seqc), vp); VNPASS(vp->v_seqc_users == 1, vp); } void vn_irflag_set_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & toset) == 0, vp, ("%s: some of the passed flags already set (have %d, passed %d)\n", __func__, flags, toset)); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_set_cond_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set_cond(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_cond_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_unset_locked(struct vnode *vp, short tounset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & tounset) == tounset, vp, ("%s: some of the passed flags not set (have %d, passed %d)\n", __func__, flags, tounset)); atomic_store_short(&vp->v_irflag, flags & ~tounset); } void vn_irflag_unset(struct vnode *vp, short tounset) { VI_LOCK(vp); vn_irflag_unset_locked(vp, tounset); VI_UNLOCK(vp); } diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 006ee24cac25..103f4dab519f 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,5035 +1,5035 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #ifdef COMPAT_FREEBSD11 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information"); static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag); static int setfflags(struct thread *td, struct vnode *, u_long); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int getutimens(const struct timespec *, enum uio_seg, struct timespec *, int *); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp); static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td); static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag); static uint64_t at2cnpflags(u_int at_flags, u_int mask) { uint64_t res; MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) != (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)); res = 0; at_flags &= mask; if ((at_flags & AT_RESOLVE_BENEATH) != 0) res |= RBENEATH; if ((at_flags & AT_SYMLINK_FOLLOW) != 0) res |= FOLLOW; /* NOFOLLOW is pseudo flag */ if ((mask & AT_SYMLINK_NOFOLLOW) != 0) { res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW : FOLLOW; } if ((mask & AT_EMPTY_PATH) != 0 && (at_flags & AT_EMPTY_PATH) != 0) res |= EMPTYPATH; return (res); } int kern_sync(struct thread *td) { struct mount *mp, *nmp; int save; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_periodic(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); return (0); } /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif /* ARGSUSED */ int sys_sync(struct thread *td, struct sync_args *uap) { return (kern_sync(td)); } /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int sys_quotactl(struct thread *td, struct quotactl_args *uap) { struct mount *mp; struct nameidata nd; int error; bool mp_busy; AUDIT_ARG_CMD(uap->cmd); AUDIT_ARG_UID(uap->uid); if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS)) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path); if ((error = namei(&nd)) != 0) return (error); NDFREE_PNBUF(&nd); mp = nd.ni_vp->v_mount; vfs_ref(mp); vput(nd.ni_vp); error = vfs_busy(mp, 0); if (error != 0) { vfs_rel(mp); return (error); } mp_busy = true; error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, &mp_busy); /* * Since quota on/off operations typically need to open quota * files, the implementation may need to unbusy the mount point * before calling into namei. Otherwise, unmount might be * started between two vfs_busy() invocations (first is ours, * second is from mount point cross-walk code in lookup()), * causing deadlock. * * Avoid unbusying mp if the implementation indicates it has * already done so. */ if (mp_busy) vfs_unbusy(mp); vfs_rel(mp); return (error); } /* * Used by statfs conversion routines to scale the block size up if * necessary so that all of the block counts are <= 'max_size'. Note * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero * value of 'n'. */ void statfs_scale_blocks(struct statfs *sf, long max_size) { uint64_t count; int shift; KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); /* * Attempt to scale the block counts to give a more accurate * overview to userland of the ratio of free space to used * space. To do this, find the largest block count and compute * a divisor that lets it fit into a signed integer <= max_size. */ if (sf->f_bavail < 0) count = -sf->f_bavail; else count = sf->f_bavail; count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); if (count <= max_size) return; count >>= flsl(max_size); shift = 0; while (count > 0) { shift++; count >>=1; } sf->f_bsize <<= shift; sf->f_blocks >>= shift; sf->f_bfree >>= shift; sf->f_bavail >>= shift; } static int kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf) { int error; if (mp == NULL) return (EBADF); error = vfs_busy(mp, 0); vfs_rel(mp); if (error != 0) return (error); #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); if (error != 0) goto out; if (priv_check_cred_vfs_generation(td->td_ucred)) { buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, buf); } out: vfs_unbusy(mp); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int sys_statfs(struct thread *td, struct statfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path); error = namei(&nd); if (error != 0) return (error); + NDFREE_PNBUF(&nd); mp = vfs_ref_from_vp(nd.ni_vp); - NDFREE_NOTHING(&nd); vrele(nd.ni_vp); return (kern_do_statfs(td, mp, buf)); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int sys_fstatfs(struct thread *td, struct fstatfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct vnode *vp; int error; AUDIT_ARG_FD(fd); error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); } #endif mp = vfs_ref_from_vp(vp); fdrop(fp, td); return (kern_do_statfs(td, mp, buf)); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int mode; }; #endif int sys_getfsstat(struct thread *td, struct getfsstat_args *uap) { size_t count; int error; if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX) return (EINVAL); error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count, UIO_USERSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; return (error); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. */ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, size_t *countp, enum uio_seg bufseg, int mode) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, *sptmp, *tofree; size_t count, maxcount; int error; switch (mode) { case MNT_WAIT: case MNT_NOWAIT: break; default: if (bufseg == UIO_SYSSPACE) *buf = NULL; return (EINVAL); } restart: maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) { sfsp = NULL; tofree = NULL; } else if (bufseg == UIO_USERSPACE) { sfsp = *buf; tofree = NULL; } else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_STATFS, M_WAITOK); } count = 0; /* * If there is no target buffer they only want the count. * * This could be TAILQ_FOREACH but it is open-coded to match the original * code below. */ if (sfsp == NULL) { mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif count++; nmp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); *countp = count; return (0); } /* * They want the entire thing. * * Short-circuit the corner case of no room for anything, avoids * relocking below. */ if (maxcount < 1) { goto out; } mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (mode == MNT_WAIT) { if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) { /* * If vfs_busy() failed, and MBF_NOWAIT * wasn't passed, then the mp is gone. * Furthermore, because of MBF_MNTLSTLOCK, * the mountlist_mtx was dropped. We have * no other choice than to start over. */ mtx_unlock(&mountlist_mtx); free(tofree, M_STATFS); goto restart; } } else { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } } sp = &mp->mnt_stat; /* * If MNT_NOWAIT is specified, do not refresh * the fsstat cache. */ if (mode != MNT_NOWAIT) { error = VFS_STATFS(mp, sp); if (error != 0) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); continue; } } if (priv_check_cred_vfs_generation(td->td_ucred)) { sptmp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); *sptmp = *sp; sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, sptmp); sp = sptmp; } else sptmp = NULL; if (bufseg == UIO_SYSSPACE) { bcopy(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); } else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); if (error != 0) { vfs_unbusy(mp); return (error); } } sfsp++; count++; if (count == maxcount) { vfs_unbusy(mp); goto out; } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); out: *countp = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. */ static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int mode; }; #endif int freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap) { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; if (uap->bufsize < 0) return (EINVAL); count = uap->bufsize / sizeof(struct ostatfs); if (count > SIZE_MAX / sizeof(struct statfs)) return (EINVAL); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size != 0) { sp = buf; while (count != 0 && error == 0) { freebsd4_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp) { statfs_scale_blocks(nsp, LONG_MAX); bzero(osp, sizeof(*osp)); osp->f_bsize = nsp->f_bsize; osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ #if defined(COMPAT_FREEBSD11) /* * Get old format filesystem statistics. */ static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *); int freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ int freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ int freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap) { return (kern_freebsd11_getfsstat(td, uap->buf, uap->bufsize, uap->mode)); } int kern_freebsd11_getfsstat(struct thread *td, struct freebsd11_statfs * ubuf, long bufsize, int mode) { struct freebsd11_statfs osb; struct statfs *buf, *sp; size_t count, size; int error; if (bufsize < 0) return (EINVAL); count = bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, mode); if (error == 0) td->td_retval[0] = count; if (size > 0) { sp = buf; while (count > 0 && error == 0) { freebsd11_cvtstatfs(sp, &osb); error = copyout(&osb, ubuf, sizeof(osb)); sp++; ubuf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ int freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp) { bzero(osp, sizeof(*osp)); osp->f_version = FREEBSD11_STATFS_VERSION; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_bsize = nsp->f_bsize; osp->f_iosize = nsp->f_iosize; osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = nsp->f_files; osp->f_ffree = nsp->f_ffree; osp->f_syncwrites = nsp->f_syncwrites; osp->f_asyncwrites = nsp->f_asyncwrites; osp->f_syncreads = nsp->f_syncreads; osp->f_asyncreads = nsp->f_asyncreads; osp->f_namemax = nsp->f_namemax; osp->f_owner = nsp->f_owner; osp->f_fsid = nsp->f_fsid; strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, sizeof(osp->f_fstypename))); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, sizeof(osp->f_mntonname))); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, sizeof(osp->f_mntfromname))); } #endif /* COMPAT_FREEBSD11 */ /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int sys_fchdir(struct thread *td, struct fchdir_args *uap) { struct vnode *vp, *tdp; struct mount *mp; struct file *fp; int error; AUDIT_ARG_FD(uap->fd); error = getvnode_path(td, uap->fd, &cap_fchdir_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; vrefact(vp); fdrop(fp, td); vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0)) continue; error = VFS_ROOT(mp, LK_SHARED, &tdp); vfs_unbusy(mp); if (error != 0) break; vput(vp); vp = tdp; } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp); pwd_chdir(td, vp); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int sys_chdir(struct thread *td, struct chdir_args *uap) { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path); if ((error = namei(&nd)) != 0) return (error); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); return (error); } VOP_UNLOCK(nd.ni_vp); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); pwd_chdir(td, nd.ni_vp); return (0); } static int unprivileged_chroot = 0; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_chroot, CTLFLAG_RW, &unprivileged_chroot, 0, "Unprivileged processes can use chroot(2)"); /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int sys_chroot(struct thread *td, struct chroot_args *uap) { struct nameidata nd; struct proc *p; int error; error = priv_check(td, PRIV_VFS_CHROOT); if (error != 0) { p = td->td_proc; PROC_LOCK(p); if (unprivileged_chroot == 0 || (p->p_flag2 & P2_NO_NEW_PRIVS) == 0) { PROC_UNLOCK(p); return (error); } PROC_UNLOCK(p); } NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path); error = namei(&nd); if (error != 0) return (error); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); error = change_dir(nd.ni_vp, td); if (error != 0) goto e_vunlock; #ifdef MAC error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp); if (error != 0) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp); error = pwd_chroot(td, nd.ni_vp); vrele(nd.ni_vp); return (error); e_vunlock: vput(nd.ni_vp); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(struct vnode *vp, struct thread *td) { #ifdef MAC int error; #endif ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_vnode_check_chdir(td->td_ucred, vp); if (error != 0) return (error); #endif return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td)); } static __inline void flags_to_rights(int flags, cap_rights_t *rightsp) { if (flags & O_EXEC) { cap_rights_set_one(rightsp, CAP_FEXECVE); if (flags & O_PATH) return; } else { switch ((flags & O_ACCMODE)) { case O_RDONLY: cap_rights_set_one(rightsp, CAP_READ); break; case O_RDWR: cap_rights_set_one(rightsp, CAP_READ); /* FALLTHROUGH */ case O_WRONLY: cap_rights_set_one(rightsp, CAP_WRITE); if (!(flags & (O_APPEND | O_TRUNC))) cap_rights_set_one(rightsp, CAP_SEEK); break; } } if (flags & O_CREAT) cap_rights_set_one(rightsp, CAP_CREATE); if (flags & O_TRUNC) cap_rights_set_one(rightsp, CAP_FTRUNCATE); if (flags & (O_SYNC | O_FSYNC)) cap_rights_set_one(rightsp, CAP_FSYNC); if (flags & (O_EXLOCK | O_SHLOCK)) cap_rights_set_one(rightsp, CAP_FLOCK); } /* * Check permissions, allocate an open file structure, and call the device * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int sys_open(struct thread *td, struct open_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct openat_args { int fd; char *path; int flag; int mode; }; #endif int sys_openat(struct thread *td, struct openat_args *uap) { AUDIT_ARG_FD(uap->fd); return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->mode)); } int kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp; struct pwddesc *pdp; struct file *fp; struct vnode *vp; struct nameidata nd; cap_rights_t rights; int cmode, error, indx; indx = -1; fdp = p->p_fd; pdp = p->p_pd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); cap_rights_init_one(&rights, CAP_LOOKUP); flags_to_rights(flags, &rights); /* * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags * may be specified. On the other hand, for O_PATH any mode * except O_EXEC is ignored. */ if ((flags & O_PATH) != 0) { flags &= ~(O_CREAT | O_ACCMODE); } else if ((flags & O_EXEC) != 0) { if (flags & O_ACCMODE) return (EINVAL); } else if ((flags & O_ACCMODE) == O_ACCMODE) { return (EINVAL); } else { flags = FFLAGS(flags); } /* * Allocate a file structure. The descriptor to reference it * is allocated and used by finstall_refed() below. */ error = falloc_noinstall(td, &fp); if (error != 0) return (error); /* Set the flags early so the finit in devfs can pick them up. */ fp->f_flag = flags & FMASK; cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT; NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | WANTIOCTLCAPS, pathseg, path, fd, &rights); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open_cred(&nd, &flags, cmode, VN_OPEN_WANTIOCTLCAPS, td->td_ucred, fp); if (error != 0) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { MPASS((flags & O_PATH) == 0); goto success; } /* * Handle special fdopen() case. bleh. * * Don't do this for relative (capability) lookups; we don't * understand exactly what would happen, and we don't think * that it ever should. */ if ((nd.ni_resflags & NIRES_STRICTREL) == 0 && (error == ENODEV || error == ENXIO) && td->td_dupfd >= 0) { error = dupfdopen(td, fdp, td->td_dupfd, flags, error, &indx); if (error == 0) goto success; } goto bad; } td->td_dupfd = 0; NDFREE_PNBUF(&nd); vp = nd.ni_vp; /* * Store the vnode, for any f_type. Typically, the vnode use * count is decremented by direct call to vn_closefile() for * files that switched type in the cdevsw fdopen() method. */ fp->f_vnode = vp; /* * If the file wasn't claimed by devfs bind it to the normal * vnode operations here. */ if (fp->f_ops == &badfileops) { KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0, ("Unexpected fifo fp %p vp %p", fp, vp)); if ((flags & O_PATH) != 0) { finit(fp, (flags & FMASK) | (fp->f_flag & FKQALLOWED), DTYPE_VNODE, NULL, &path_fileops); } else { finit_vnode(fp, flags, NULL, &vnops); } } VOP_UNLOCK(vp); if (flags & O_TRUNC) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } success: /* * If we haven't already installed the FD (for dupfdopen), do so now. */ if (indx == -1) { struct filecaps *fcaps; #ifdef CAPABILITIES if ((nd.ni_resflags & NIRES_STRICTREL) != 0) fcaps = &nd.ni_filecaps; else #endif fcaps = NULL; error = finstall_refed(td, fp, &indx, flags, fcaps); /* On success finstall_refed() consumes fcaps. */ if (error != 0) { goto bad; } } else { NDFREE_IOCTLCAPS(&nd); falloc_abort(td, fp); } td->td_retval[0] = indx; return (0); bad: KASSERT(indx == -1, ("indx=%d, should be -1", indx)); NDFREE_IOCTLCAPS(&nd); falloc_abort(td, fp); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(struct thread *td, struct ocreat_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknodat_args { int fd; char *path; mode_t mode; dev_t dev; }; #endif int sys_mknodat(struct thread *td, struct mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #if defined(COMPAT_FREEBSD11) int freebsd11_mknod(struct thread *td, struct freebsd11_mknod_args *uap) { return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int freebsd11_mknodat(struct thread *td, struct freebsd11_mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #endif /* COMPAT_FREEBSD11 */ int kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode, dev_t dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; struct nameidata nd; int error, whiteout = 0; AUDIT_ARG_MODE(mode); AUDIT_ARG_DEV(dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); if (error == 0 && dev == VNOVAL) error = EINVAL; break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; case S_IFIFO: if (dev == 0) return (kern_mkfifoat(td, fd, path, pathseg, mode)); /* FALLTHROUGH */ default: error = EINVAL; break; } if (error != 0) return (error); NDPREINIT(&nd); restart: bwillwrite(); - NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | - NOCACHE, pathseg, path, fd, &cap_mknodat_rights); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | AUDITVNODE1 | NOCACHE, + pathseg, path, fd, &cap_mknodat_rights); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE_PNBUF(&nd); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); return (EEXIST); } else { VATTR_NULL(&vattr); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask; vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) { if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); } } VOP_VPUT_PAIR(nd.ni_dvp, error == 0 && !whiteout ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE_PNBUF(&nd); if (error == ERELOOKUP) goto restart; return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int sys_mkfifo(struct thread *td, struct mkfifo_args *uap) { return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkfifoat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap) { return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifoat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDPREINIT(&nd); restart: bwillwrite(); - NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | - NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | AUDITVNODE1 | NOCACHE, + pathseg, path, fd, &cap_mkfifoat_rights); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE_PNBUF(&nd); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE_PNBUF(&nd); if (error == ERELOOKUP) goto restart; return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int sys_link(struct thread *td, struct link_args *uap) { return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link, UIO_USERSPACE, AT_SYMLINK_FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct linkat_args { int fd1; char *path1; int fd2; char *path2; int flag; }; #endif int sys_linkat(struct thread *td, struct linkat_args *uap) { return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2, UIO_USERSPACE, uap->flag)); } int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } return (0); } int kern_linkat(struct thread *td, int fd1, int fd2, const char *path1, const char *path2, enum uio_seg segflag, int flag) { struct nameidata nd; int error; if ((flag & ~(AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); NDPREINIT(&nd); do { bwillwrite(); NDINIT_ATRIGHTS(&nd, LOOKUP, AUDITVNODE1 | at2cnpflags(flag, AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH), segflag, path1, fd1, &cap_linkat_source_rights); if ((error = namei(&nd)) != 0) return (error); NDFREE_PNBUF(&nd); if ((nd.ni_resflags & NIRES_EMPTYPATH) != 0) { error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) { vrele(nd.ni_vp); return (error); } } error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); } while (error == EAGAIN || error == ERELOOKUP); return (error); } static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag) { struct nameidata nd; struct mount *mp; int error; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } NDINIT_ATRIGHTS(&nd, CREATE, - LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd, + LOCKPARENT | AUDITVNODE2 | NOCACHE, segflag, path, fd, &cap_linkat_target_rights); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { NDFREE_PNBUF(&nd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); vrele(vp); return (EEXIST); } else if (nd.ni_dvp->v_mount != vp->v_mount) { /* * Cross-device link. No need to recheck * vp->v_type, since it cannot change, except * to VBAD. */ NDFREE_PNBUF(&nd); vput(nd.ni_dvp); vrele(vp); return (EXDEV); } else if (vn_lock(vp, LK_EXCLUSIVE) == 0) { error = can_hardlink(vp, td->td_ucred); #ifdef MAC if (error == 0) error = mac_vnode_check_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); #endif if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE_PNBUF(&nd); return (error); } error = vn_start_write(vp, &mp, V_NOWAIT); if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE_PNBUF(&nd); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); return (EAGAIN); } error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_VPUT_PAIR(nd.ni_dvp, &vp, true); vn_finished_write(mp); NDFREE_PNBUF(&nd); vp = NULL; } else { vput(nd.ni_dvp); NDFREE_PNBUF(&nd); vrele(vp); return (EAGAIN); } } if (vp != NULL) vrele(vp); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int sys_symlink(struct thread *td, struct symlink_args *uap) { return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct symlinkat_args { char *path; int fd; char *path2; }; #endif int sys_symlinkat(struct thread *td, struct symlinkat_args *uap) { return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2, UIO_USERSPACE)); } int kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; const char *syspath; char *tmppath; struct nameidata nd; int error; if (segflg == UIO_SYSSPACE) { syspath = path1; } else { tmppath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0) goto out; syspath = tmppath; } AUDIT_ARG_TEXT(syspath); NDPREINIT(&nd); restart: bwillwrite(); - NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | - NOCACHE, segflg, path2, fd, &cap_symlinkat_rights); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | AUDITVNODE1 | NOCACHE, segflg, + path2, fd, &cap_symlinkat_rights); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE_PNBUF(&nd); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); nd.ni_vp = NULL; error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_pd->pd_cmask; #ifdef MAC vattr.va_type = VLNK; error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out2; #endif error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); #ifdef MAC out2: #endif VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE_PNBUF(&nd); if (error == ERELOOKUP) goto restart; out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, tmppath); return (error); } /* * Delete a whiteout from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct undelete_args { char *path; }; #endif int sys_undelete(struct thread *td, struct undelete_args *uap) { struct mount *mp; struct nameidata nd; int error; NDPREINIT(&nd); restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1, UIO_USERSPACE, uap->path); error = namei(&nd); if (error != 0) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE_PNBUF(&nd); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE_PNBUF(&nd); vput(nd.ni_dvp); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int sys_unlink(struct thread *td, struct unlink_args *uap) { return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0, 0)); } static int kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd, int flag, enum uio_seg pathseg, ino_t oldinum) { if ((flag & ~(AT_REMOVEDIR | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); if ((flag & AT_REMOVEDIR) != 0) return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0)); return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct unlinkat_args { int fd; char *path; int flag; }; #endif int sys_unlinkat(struct thread *td, struct unlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag, UIO_USERSPACE, 0)); } #ifndef _SYS_SYSPROTO_H_ struct funlinkat_args { int dfd; const char *path; int fd; int flag; }; #endif int sys_funlinkat(struct thread *td, struct funlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag, UIO_USERSPACE, 0)); } int kern_funlinkat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag, ino_t oldinum) { struct mount *mp; struct file *fp; struct vnode *vp; struct nameidata nd; struct stat sb; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode_path(td, fd, &cap_no_rights, &fp); if (error != 0) return (error); } NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights); if ((error = namei(&nd)) != 0) { if (error == EINVAL) error = EPERM; goto fdout; } vp = nd.ni_vp; if (vp->v_type == VDIR && oldinum == 0) { error = EPERM; /* POSIX */ } else if (oldinum != 0 && ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED)) == 0) && sb.st_ino != oldinum) { error = EIDRM; /* Identifier removed */ } else if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; } else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) { goto fdout; } goto restart; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if (error == ERELOOKUP) goto restart; fdout: if (fp != NULL) fdrop(fp, td); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int sys_lseek(struct thread *td, struct lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } int kern_lseek(struct thread *td, int fd, off_t offset, int whence) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_seek_rights, &fp); if (error != 0) return (error); error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ? fo_seek(fp, offset, whence, td) : ESPIPE; fdrop(fp, td); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(struct thread *td, struct olseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Version with the 'pad' argument */ int freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* * Check access permissions using passed credentials. */ static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td) { accmode_t accmode; int error; /* Flags == 0 means only check for existence. */ if (user_flags == 0) return (0); accmode = 0; if (user_flags & R_OK) accmode |= VREAD; if (user_flags & W_OK) accmode |= VWRITE; if (user_flags & X_OK) accmode |= VEXEC; #ifdef MAC error = mac_vnode_check_access(cred, vp, accmode); if (error != 0) return (error); #endif if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, accmode, cred, td); return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int amode; }; #endif int sys_access(struct thread *td, struct access_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0, uap->amode)); } #ifndef _SYS_SYSPROTO_H_ struct faccessat_args { int dirfd; char *path; int amode; int flag; } #endif int sys_faccessat(struct thread *td, struct faccessat_args *uap) { return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->amode)); } int kern_accessat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flag, int amode) { struct ucred *cred, *usecred; struct vnode *vp; struct nameidata nd; int error; if ((flag & ~(AT_EACCESS | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0) return (EINVAL); /* * Create and modify a temporary credential instead of one that * is potentially shared (if we need one). */ cred = td->td_ucred; if ((flag & AT_EACCESS) == 0 && ((cred->cr_uid != cred->cr_ruid || cred->cr_rgid != cred->cr_groups[0]))) { usecred = crdup(cred); usecred->cr_uid = cred->cr_ruid; usecred->cr_groups[0] = cred->cr_rgid; td->td_ucred = usecred; } else usecred = cred; AUDIT_ARG_VALUE(amode); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH | AT_EMPTY_PATH), pathseg, path, fd, &cap_fstat_rights); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; error = vn_access(vp, amode, usecred, td); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); vput(vp); out: if (usecred != cred) { td->td_ucred = cred; crfree(usecred); } return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int amode; }; #endif int sys_eaccess(struct thread *td, struct eaccess_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, AT_EACCESS, uap->amode)); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(struct thread *td, struct ostat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(struct thread *td, struct olstat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Convert from an old to a new stat structure. * XXX: many values are blindly truncated. */ void cvtstat(struct stat *st, struct ostat *ost) { bzero(ost, sizeof(*ost)); ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; ost->st_size = MIN(st->st_size, INT32_MAX); ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int ino64_trunc_error; SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW, &ino64_trunc_error, 0, "Error on truncation of device, file or inode number, or link count"); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost) { ost->st_dev = st->st_dev; if (ost->st_dev != st->st_dev) { switch (ino64_trunc_error) { default: /* * Since dev_t is almost raw, don't clamp to the * maximum for case 2, but ignore the error. */ break; case 1: return (EOVERFLOW); } } ost->st_ino = st->st_ino; if (ost->st_ino != st->st_ino) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_ino = UINT32_MAX; break; } } ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; if (ost->st_nlink != st->st_nlink) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_nlink = UINT16_MAX; break; } } ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (ost->st_rdev != st->st_rdev) { switch (ino64_trunc_error) { default: break; case 1: return (EOVERFLOW); } } ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_size = st->st_size; ost->st_blocks = st->st_blocks; ost->st_blksize = st->st_blksize; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; ost->st_lspare = 0; ost->st_birthtim = st->st_birthtim; bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim), sizeof(*ost) - offsetof(struct freebsd11_stat, st_birthtim) - sizeof(ost->st_birthtim)); return (0); } int freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap) { struct fhandle fh; struct stat sb; struct freebsd11_stat osb; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } int freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->buf, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Get file status */ #ifndef _SYS_SYSPROTO_H_ struct fstatat_args { int fd; char *path; struct stat *buf; int flag; } #endif int sys_fstatat(struct thread *td, struct fstatat_args *uap) { struct stat sb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error == 0) error = copyout(&sb, uap->buf, sizeof (sb)); return (error); } int kern_statat(struct thread *td, int flag, int fd, const char *path, enum uio_seg pathseg, struct stat *sbp, void (*hook)(struct vnode *vp, struct stat *sbp)) { struct nameidata nd; int error; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights); if ((error = namei(&nd)) != 0) { if (error == ENOTDIR && (nd.ni_resflags & NIRES_EMPTYPATH) != 0) error = kern_fstat(td, fd, sbp); return (error); } error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED); if (__predict_false(hook != NULL)) { if (error == 0) { hook(nd.ni_vp, sbp); } } - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); vput(nd.ni_vp); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Implementation of the NetBSD [l]stat() functions. */ int freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb) { struct freebsd11_stat sb11; int error; error = freebsd11_cvtstat(sb, &sb11); if (error != 0) return (error); bzero(nsb, sizeof(*nsb)); CP(sb11, *nsb, st_dev); CP(sb11, *nsb, st_ino); CP(sb11, *nsb, st_mode); CP(sb11, *nsb, st_nlink); CP(sb11, *nsb, st_uid); CP(sb11, *nsb, st_gid); CP(sb11, *nsb, st_rdev); CP(sb11, *nsb, st_atim); CP(sb11, *nsb, st_mtim); CP(sb11, *nsb, st_ctim); CP(sb11, *nsb, st_size); CP(sb11, *nsb, st_blocks); CP(sb11, *nsb, st_blksize); CP(sb11, *nsb, st_flags); CP(sb11, *nsb, st_gen); CP(sb11, *nsb, st_birthtim); return (0); } #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtnstat(&sb, &nsb); if (error == 0) error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nlstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtnstat(&sb, &nsb); if (error == 0) error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int sys_pathconf(struct thread *td, struct pathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } #ifndef _SYS_SYSPROTO_H_ struct lpathconf_args { char *path; int name; }; #endif int sys_lpathconf(struct thread *td, struct lpathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg, int name, u_long flags, long *valuep) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags, pathseg, path); if ((error = namei(&nd)) != 0) return (error); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); error = VOP_PATHCONF(nd.ni_vp, name, valuep); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; size_t count; }; #endif int sys_readlink(struct thread *td, struct readlink_args *uap) { return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } #ifndef _SYS_SYSPROTO_H_ struct readlinkat_args { int fd; char *path; char *buf; size_t bufsize; }; #endif int sys_readlinkat(struct thread *td, struct readlinkat_args *uap) { return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->bufsize)); } int kern_readlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count) { struct vnode *vp; struct nameidata nd; int error; if (count > IOSIZE_MAX) return (EINVAL); NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | EMPTYPATH, pathseg, path, fd); if ((error = namei(&nd)) != 0) return (error); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); vp = nd.ni_vp; error = kern_readlink_vp(vp, buf, bufseg, count, td); vput(vp); return (error); } /* * Helper function to readlink from a vnode */ static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td) { struct iovec aiov; struct uio auio; int error; ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked"); #ifdef MAC error = mac_vnode_check_readlink(td->td_ucred, vp); if (error != 0) return (error); #endif if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0) return (EINVAL); aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(struct thread *td, struct vnode *vp, u_long flags) { struct mount *mp; struct vattr vattr; int error; /* We can't support the value matching VNOVAL. */ if (flags == VNOVAL) return (EOPNOTSUPP); /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error != 0) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VATTR_NULL(&vattr); vattr.va_flags = flags; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { const char *path; u_long flags; }; #endif int sys_chflags(struct thread *td, struct chflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, 0)); } #ifndef _SYS_SYSPROTO_H_ struct chflagsat_args { int fd; const char *path; u_long flags; int atflag; } #endif int sys_chflagsat(struct thread *td, struct chflagsat_args *uap) { return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flags, uap->atflag)); } /* * Same as chflags() but doesn't follow symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchflags_args { const char *path; u_long flags; }; #endif int sys_lchflags(struct thread *td, struct lchflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, AT_SYMLINK_NOFOLLOW)); } static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag) { struct nameidata nd; int error; if ((atflag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); AUDIT_ARG_FFLAGS(flags); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchflags_rights); if ((error = namei(&nd)) != 0) return (error); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); error = setfflags(td, nd.ni_vp, flags); vrele(nd.ni_vp); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; u_long flags; }; #endif int sys_fchflags(struct thread *td, struct fchflags_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_FFLAGS(uap->flags); error = getvnode(td, uap->fd, &cap_fchflags_rights, &fp); if (error != 0) return (error); #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); } #endif error = setfflags(td, fp->f_vnode, uap->flags); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_vnode_check_setmode(cred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int sys_chmod(struct thread *td, struct chmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchmodat_args { int dirfd; char *path; mode_t mode; int flag; } #endif int sys_fchmodat(struct thread *td, struct fchmodat_args *uap) { return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->flag)); } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int sys_lchmod(struct thread *td, struct lchmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, AT_SYMLINK_NOFOLLOW)); } int kern_fchmodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, mode_t mode, int flag) { struct nameidata nd; int error; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); AUDIT_ARG_MODE(mode); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchmod_rights); if ((error = namei(&nd)) != 0) return (error); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); error = setfmode(td, td->td_ucred, nd.ni_vp, mode); vrele(nd.ni_vp); return (error); } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int sys_fchmod(struct thread *td, struct fchmod_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_MODE(uap->mode); error = fget(td, uap->fd, &cap_fchmod_rights, &fp); if (error != 0) return (error); error = fo_chmod(fp, uap->mode, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_vnode_check_setowner(cred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int sys_chown(struct thread *td, struct chown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchownat_args { int fd; const char * path; uid_t uid; gid_t gid; int flag; }; #endif int sys_fchownat(struct thread *td, struct fchownat_args *uap) { return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid, uap->gid, uap->flag)); } int kern_fchownat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int uid, int gid, int flag) { struct nameidata nd; int error; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); AUDIT_ARG_OWNER(uid, gid); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchown_rights); if ((error = namei(&nd)) != 0) return (error); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int sys_lchown(struct thread *td, struct lchown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW)); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int sys_fchown(struct thread *td, struct fchown_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_OWNER(uap->uid, uap->gid); error = fget(td, uap->fd, &cap_fchown_rights, &fp); if (error != 0) return (error); error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg, struct timespec *tsp) { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { vfs_timestamp(&tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for futimens(), utimensat(). */ #define UTIMENS_NULL 0x1 #define UTIMENS_EXIT 0x2 static int getutimens(const struct timespec *usrtsp, enum uio_seg tspseg, struct timespec *tsp, int *retflags) { struct timespec tsnow; int error; vfs_timestamp(&tsnow); *retflags = 0; if (usrtsp == NULL) { tsp[0] = tsnow; tsp[1] = tsnow; *retflags |= UTIMENS_NULL; return (0); } if (tspseg == UIO_SYSSPACE) { tsp[0] = usrtsp[0]; tsp[1] = usrtsp[1]; } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0) return (error); if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT) *retflags |= UTIMENS_EXIT; if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW) *retflags |= UTIMENS_NULL; if (tsp[0].tv_nsec == UTIME_OMIT) tsp[0].tv_sec = VNOVAL; else if (tsp[0].tv_nsec == UTIME_NOW) tsp[0] = tsnow; else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L) return (EINVAL); if (tsp[1].tv_nsec == UTIME_OMIT) tsp[1].tv_sec = VNOVAL; else if (tsp[1].tv_nsec == UTIME_NOW) tsp[1] = tsnow; else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L) return (EINVAL); return (0); } /* * Common implementation code for utimes(), lutimes(), futimes(), futimens(), * and utimensat(). */ static int setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts, int numtimes, int nullflag) { struct mount *mp; struct vattr vattr; int error; bool setbirthtime; setbirthtime = false; vattr.va_birthtime.tv_sec = VNOVAL; vattr.va_birthtime.tv_nsec = 0; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred) == 0 && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = true; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int sys_utimes(struct thread *td, struct utimes_args *uap) { return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct futimesat_args { int fd; const char * path; const struct timeval * times; }; #endif int sys_futimesat(struct thread *td, struct futimesat_args *uap) { return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE)); } int kern_utimesat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, const struct timeval *tptr, enum uio_seg tptrseg) { struct nameidata nd; struct timespec ts[2]; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights); if ((error = namei(&nd)) != 0) return (error); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int sys_lutimes(struct thread *td, struct lutimes_args *uap) { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg, const struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct nameidata nd; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path); if ((error = namei(&nd)) != 0) return (error); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int sys_futimes(struct thread *td, struct futimes_args *uap) { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, const struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getutimes(tptr, tptrseg, ts); if (error != 0) return (error); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); } #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); fdrop(fp, td); return (error); } int sys_futimens(struct thread *td, struct futimens_args *uap) { return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE)); } int kern_futimens(struct thread *td, int fd, const struct timespec *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error, flags; AUDIT_ARG_FD(fd); error = getutimens(tptr, tptrseg, ts, &flags); if (error != 0) return (error); if (flags & UTIMENS_EXIT) return (0); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); } #endif error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL); fdrop(fp, td); return (error); } int sys_utimensat(struct thread *td, struct utimensat_args *uap) { return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE, uap->flag)); } int kern_utimensat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, const struct timespec *tptr, enum uio_seg tptrseg, int flag) { struct nameidata nd; struct timespec ts[2]; int error, flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights); if ((error = namei(&nd)) != 0) return (error); /* * We are allowed to call namei() regardless of 2xUTIME_OMIT. * POSIX states: * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected." * "Search permission is denied by a component of the path prefix." */ - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); if ((flags & UTIMENS_EXIT) == 0) error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL); vrele(nd.ni_vp); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int sys_truncate(struct thread *td, struct truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; void *rl_cookie; struct nameidata nd; int error; if (length < 0) return (EINVAL); NDPREINIT(&nd); retry: NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vn_rangelock_unlock(vp, rl_cookie); vrele(vp); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef MAC error = mac_vnode_check_write(td->td_ucred, NOCRED, vp); if (error != 0) goto out; #endif error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); if (error != 0) goto out; error = vn_truncate_locked(vp, length, false, td->td_ucred); out: VOP_UNLOCK(vp); vn_finished_write(mp); vn_rangelock_unlock(vp, rl_cookie); vrele(vp); if (error == ERELOOKUP) goto retry; return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(struct thread *td, struct otruncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Versions with the pad argument */ int freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif int kern_fsync(struct thread *td, int fd, bool fullsync) { struct vnode *vp; struct mount *mp; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getvnode(td, fd, &cap_fsync_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; #if 0 if (!fullsync) /* XXXKIB: compete outstanding aio writes */; #endif retry: error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto drop; vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); VOP_UNLOCK(vp); vn_finished_write(mp); if (error == ERELOOKUP) goto retry; drop: fdrop(fp, td); return (error); } /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int sys_fsync(struct thread *td, struct fsync_args *uap) { return (kern_fsync(td, uap->fd, true)); } int sys_fdatasync(struct thread *td, struct fdatasync_args *uap) { return (kern_fsync(td, uap->fd, false)); } /* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int sys_rename(struct thread *td, struct rename_args *uap) { return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD, uap->to, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct renameat_args { int oldfd; char *old; int newfd; char *new; }; #endif int sys_renameat(struct thread *td, struct renameat_args *uap) { return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new, UIO_USERSPACE)); } #ifdef MAC static int kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg, struct nameidata *fromnd) { int error; NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights); if ((error = namei(fromnd)) != 0) return (error); error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp, fromnd->ni_vp, &fromnd->ni_cnd); VOP_UNLOCK(fromnd->ni_dvp); if (fromnd->ni_dvp != fromnd->ni_vp) VOP_UNLOCK(fromnd->ni_vp); if (error != 0) { NDFREE_PNBUF(fromnd); vrele(fromnd->ni_dvp); vrele(fromnd->ni_vp); if (fromnd->ni_startdir) vrele(fromnd->ni_startdir); } return (error); } #endif int kern_renameat(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; uint64_t tondflags; int error; again: bwillwrite(); #ifdef MAC if (mac_vnode_check_rename_from_enabled()) { error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg, &fromnd); if (error != 0) return (error); } else { #endif NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights); if ((error = namei(&fromnd)) != 0) return (error); #ifdef MAC } #endif fvp = fromnd.ni_vp; tondflags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2; if (fromnd.ni_vp->v_type == VDIR) tondflags |= WILLBEDIR; NDINIT_ATRIGHTS(&tond, RENAME, tondflags, pathseg, new, newfd, &cap_renameat_target_rights); if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE_PNBUF(&fromnd); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; error = vn_start_write(fvp, &mp, V_NOWAIT); if (error != 0) { NDFREE_PNBUF(&fromnd); NDFREE_PNBUF(&tond); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); vrele(tond.ni_startdir); if (fromnd.ni_startdir != NULL) vrele(fromnd.ni_startdir); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); goto again; } if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef CAPABILITIES if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) { /* * If the target already exists we require CAP_UNLINKAT * from 'newfd', when newfd was used for the lookup. */ error = cap_check(&tond.ni_filecaps.fc_rights, &cap_unlinkat_rights); if (error != 0) goto out; } #endif } if (fvp == tdvp) { error = EINVAL; goto out; } /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) error = ERESTART; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (error == 0) { error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE_PNBUF(&fromnd); NDFREE_PNBUF(&tond); } else { NDFREE_PNBUF(&fromnd); NDFREE_PNBUF(&tond); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == ERESTART) return (0); if (error == ERELOOKUP) goto again; return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int sys_mkdir(struct thread *td, struct mkdir_args *uap) { return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkdirat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkdirat(struct thread *td, struct mkdirat_args *uap) { return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDPREINIT(&nd); restart: bwillwrite(); - NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | AUDITVNODE1 | NC_NOMAKEENTRY | NC_KEEPPOSENTRY | FAILIFEXISTS | WILLBEDIR, segflg, path, fd, &cap_mkdirat_rights); if ((error = namei(&nd)) != 0) return (error); if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_pd->pd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE_PNBUF(&nd); VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int sys_rmdir(struct thread *td, struct rmdir_args *uap) { return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0)); } int kern_frmdirat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag) { struct mount *mp; struct vnode *vp; struct file *fp; struct nameidata nd; cap_rights_t rights; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode(td, fd, cap_rights_init_one(&rights, CAP_LOOKUP), &fp); if (error != 0) return (error); } NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights); if ((error = namei(&nd)) != 0) goto fdout; vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; goto out; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto fdout; goto restart; } vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE_PNBUF(&nd); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (error == ERELOOKUP) goto restart; fdout: if (fp != NULL) fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count, long *basep, void (*func)(struct freebsd11_dirent *)) { struct freebsd11_dirent dstdp; struct dirent *dp, *edp; char *dirbuf; off_t base; ssize_t resid, ucount; int error; /* XXX arbitrary sanity limit on `count'. */ count = min(count, 64 * 1024); dirbuf = malloc(count, M_TEMP, M_WAITOK); error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid, UIO_SYSSPACE); if (error != 0) goto done; if (basep != NULL) *basep = base; ucount = 0; for (dp = (struct dirent *)dirbuf, edp = (struct dirent *)&dirbuf[count - resid]; ucount < count && dp < edp; ) { if (dp->d_reclen == 0) break; MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0)); if (dp->d_namlen >= sizeof(dstdp.d_name)) continue; dstdp.d_type = dp->d_type; dstdp.d_namlen = dp->d_namlen; dstdp.d_fileno = dp->d_fileno; /* truncate */ if (dstdp.d_fileno != dp->d_fileno) { switch (ino64_trunc_error) { default: case 0: break; case 1: error = EOVERFLOW; goto done; case 2: dstdp.d_fileno = UINT32_MAX; break; } } dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) + ((dp->d_namlen + 1 + 3) &~ 3); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); bzero(dstdp.d_name + dstdp.d_namlen, dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) - dstdp.d_namlen); MPASS(dstdp.d_reclen <= dp->d_reclen); MPASS(ucount + dstdp.d_reclen <= count); if (func != NULL) func(&dstdp); error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen); if (error != 0) break; dp = (struct dirent *)((char *)dp + dp->d_reclen); ucount += dstdp.d_reclen; } done: free(dirbuf, M_TEMP); if (error == 0) td->td_retval[0] = ucount; return (error); } #endif /* COMPAT */ #ifdef COMPAT_43 static void ogetdirentries_cvt(struct freebsd11_dirent *dp) { #if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; #else /* * The dp->d_type is the high byte of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; #endif } /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(struct thread *td, struct ogetdirentries_args *uap) { long loff; int error; error = kern_ogetdirentries(td, uap, &loff); if (error == 0) error = copyout(&loff, uap->basep, sizeof(long)); return (error); } int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, long *ploff) { long base; int error; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, ogetdirentries_cvt); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) #ifndef _SYS_SYSPROTO_H_ struct freebsd11_getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int freebsd11_getdirentries(struct thread *td, struct freebsd11_getdirentries_args *uap) { long base; int error; error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } int freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap) { struct freebsd11_getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (freebsd11_getdirentries(td, &ap)); } #endif /* COMPAT_FREEBSD11 */ /* * Read a block of directory entries in a filesystem independent format. */ int sys_getdirentries(struct thread *td, struct getdirentries_args *uap) { off_t base; int error; error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL, UIO_USERSPACE); if (error != 0) return (error); if (uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(off_t)); return (error); } int kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, off_t *basep, ssize_t *residp, enum uio_seg bufseg) { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; off_t loff; int error, eofflag; off_t foffset; AUDIT_ARG_FD(fd); if (count > IOSIZE_MAX) return (EINVAL); auio.uio_resid = count; error = getvnode(td, fd, &cap_read_rights, &fp); if (error != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; foffset = foffset_lock(fp, 0); unionread: if (vp->v_type != VDIR) { error = EINVAL; goto fail; } if (__predict_false((vp->v_vflag & VV_UNLINKED) != 0)) { error = ENOENT; goto fail; } aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); loff = auio.uio_offset = foffset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); foffset = auio.uio_offset; if (error != 0) { VOP_UNLOCK(vp); goto fail; } if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; foffset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp); *basep = loff; if (residp != NULL) *residp = auio.uio_resid; td->td_retval[0] = count - auio.uio_resid; fail: foffset_unlock(fp, foffset, 0); fdrop(fp, td); return (error); } /* * Set the mode mask for creation of filesystem nodes. */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int sys_umask(struct thread *td, struct umask_args *uap) { struct pwddesc *pdp; pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); td->td_retval[0] = pdp->pd_cmask; pdp->pd_cmask = uap->newmask & ALLPERMS; PWDDESC_XUNLOCK(pdp); return (0); } /* * Void all references to file by ripping underlying filesystem away from * vnode. */ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int sys_revoke(struct thread *td, struct revoke_args *uap) { struct vnode *vp; struct vattr vattr; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); if (vp->v_type != VCHR || vp->v_rdev == NULL) { error = EINVAL; goto out; } #ifdef MAC error = mac_vnode_check_revoke(td->td_ucred, vp); if (error != 0) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto out; } if (devfs_usecount(vp) > 0) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); return (error); } /* * This variant of getvnode() allows O_PATH files. Caller should * ensure that returned file and vnode are only used for compatible * semantics. */ int getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { struct file *fp; int error; error = fget_unlocked(td, fd, rightsp, &fp); if (error != 0) return (error); /* * The file could be not of the vnode type, or it may be not * yet fully initialized, in which case the f_vnode pointer * may be set, but f_ops is still badfileops. E.g., * devfs_open() transiently create such situation to * facilitate csw d_fdopen(). * * Dupfdopen() handling in kern_openat() installs the * half-baked file into the process descriptor table, allowing * other thread to dereference it. Guard against the race by * checking f_ops. */ if (__predict_false(fp->f_vnode == NULL || fp->f_ops == &badfileops)) { fdrop(fp, td); *fpp = NULL; return (EINVAL); } *fpp = fp; return (0); } /* * Convert a user file descriptor to a kernel file entry and check * that, if it is a capability, the correct rights are present. * A reference on the file entry is held upon returning. */ int getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { int error; error = getvnode_path(td, fd, rightsp, fpp); if (__predict_false(error != 0)) return (error); /* * Filter out O_PATH file descriptors, most getvnode() callers * do not call fo_ methods. */ if (__predict_false((*fpp)->f_ops == &path_fileops)) { fdrop(*fpp, td); *fpp = NULL; error = EBADF; } return (error); } /* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_lgetfh(struct thread *td, struct lgetfh_args *uap) { return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_getfh(struct thread *td, struct getfh_args *uap) { return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } /* * syscall for the rpc.lockd to use to translate an open descriptor into * a NFS file handle. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct getfhat_args { int fd; char *path; fhandle_t *fhp; int flags; }; #endif int sys_getfhat(struct thread *td, struct getfhat_args *uap) { return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } int kern_getfhat(struct thread *td, int flags, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp, enum uio_seg fhseg) { struct nameidata nd; fhandle_t fh; struct vnode *vp; int error; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path, fd); error = namei(&nd); if (error != 0) return (error); - NDFREE_NOTHING(&nd); + NDFREE_PNBUF(&nd); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error == 0) { if (fhseg == UIO_USERSPACE) error = copyout(&fh, fhp, sizeof (fh)); else memcpy(fhp, &fh, sizeof(fh)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhlink_args { fhandle_t *fhp; const char *to; }; #endif int sys_fhlink(struct thread *td, struct fhlink_args *uap) { return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp)); } #ifndef _SYS_SYSPROTO_H_ struct fhlinkat_args { fhandle_t *fhp; int tofd; const char *to; }; #endif int sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap) { return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp)); } static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); error = copyin(fhp, &fh, sizeof(fh)); if (error != 0) return (error); do { bwillwrite(); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); VOP_UNLOCK(vp); error = kern_linkat_vp(td, vp, fd, path, pathseg); } while (error == EAGAIN || error == ERELOOKUP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhreadlink_args { fhandle_t *fhp; char *buf; size_t bufsize; }; #endif int sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); if (uap->bufsize > IOSIZE_MAX) return (EINVAL); error = copyin(uap->fhp, &fh, sizeof(fh)); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td); vput(vp); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int sys_fhopen(struct thread *td, struct fhopen_args *uap) { return (kern_fhopen(td, uap->u_fhp, uap->flags)); } int kern_fhopen(struct thread *td, const struct fhandle *u_fhp, int flags) { struct mount *mp; struct vnode *vp; struct fhandle fhp; struct file *fp; int fmode, error; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) return (error); indx = -1; fmode = FFLAGS(flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(u_fhp, &fhp, sizeof(fhp)); if (error != 0) return(error); /* find the mount point */ mp = vfs_busyfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = falloc_noinstall(td, &fp); if (error != 0) { vput(vp); return (error); } /* * An extra reference on `fp' has been held for us by * falloc_noinstall(). */ #ifdef INVARIANTS td->td_dupfd = -1; #endif error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp); if (error != 0) { KASSERT(fp->f_ops == &badfileops, ("VOP_OPEN in fhopen() set f_ops")); KASSERT(td->td_dupfd < 0, ("fhopen() encountered fdopen()")); vput(vp); goto bad; } #ifdef INVARIANTS td->td_dupfd = 0; #endif fp->f_vnode = vp; finit_vnode(fp, fmode, NULL, &vnops); VOP_UNLOCK(vp); if ((fmode & O_TRUNC) != 0) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } error = finstall(td, fp, &indx, fmode, NULL); bad: fdrop(fp, td); td->td_retval[0] = indx; return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int sys_fhstat(struct thread *td, struct fhstat_args *uap) { struct stat sb; struct fhandle fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fh)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error == 0) error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } int kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = VOP_STAT(vp, sb, td->td_ucred, NOCRED); vput(vp); return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap) { struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(*sfp)); free(sfp, M_STATFS); return (error); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); if (error != 0) { vfs_unbusy(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error != 0) goto out; #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); out: vfs_unbusy(mp); return (error); } /* * Unlike madvise(2), we do not make a best effort to remember every * possible caching hint. Instead, we remember the last setting with * the exception that we will allow POSIX_FADV_NORMAL to adjust the * region of any current setting. */ int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, int advice) { struct fadvise_info *fa, *new; struct file *fp; struct vnode *vp; off_t end; int error; if (offset < 0 || len < 0 || offset > OFF_MAX - len) return (EINVAL); AUDIT_ARG_VALUE(advice); switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK); break; case POSIX_FADV_NORMAL: case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: new = NULL; break; default: return (EINVAL); } /* XXX: CAP_POSIX_FADVISE? */ AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_no_rights, &fp); if (error != 0) goto out; AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto out; } vp = fp->f_vnode; if (vp->v_type != VREG) { error = ENODEV; goto out; } if (len == 0) end = OFF_MAX; else end = offset + len - 1; switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: /* * Try to merge any existing non-standard region with * this new region if possible, otherwise create a new * non-standard region for this request. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL && fa->fa_advice == advice && ((fa->fa_start <= end && fa->fa_end >= offset) || (end != OFF_MAX && fa->fa_start == end + 1) || (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) { if (offset < fa->fa_start) fa->fa_start = offset; if (end > fa->fa_end) fa->fa_end = end; } else { new->fa_advice = advice; new->fa_start = offset; new->fa_end = end; fp->f_advice = new; new = fa; } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_NORMAL: /* * If a the "normal" region overlaps with an existing * non-standard region, trim or remove the * non-standard region. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL) { if (offset <= fa->fa_start && end >= fa->fa_end) { new = fa; fp->f_advice = NULL; } else if (offset <= fa->fa_start && end >= fa->fa_start) fa->fa_start = end + 1; else if (offset <= fa->fa_end && end >= fa->fa_end) fa->fa_end = offset - 1; else if (offset >= fa->fa_start && end <= fa->fa_end) { /* * If the "normal" region is a middle * portion of the existing * non-standard region, just remove * the whole thing rather than picking * one side or the other to * preserve. */ new = fa; fp->f_advice = NULL; } } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: error = VOP_ADVISE(vp, offset, end, advice); break; } out: if (fp != NULL) fdrop(fp, td); free(new, M_FADVISE); return (error); } int sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) { int error; error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, uap->advice); return (kern_posix_error(td, error)); } int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, off_t *outoffp, size_t len, unsigned int flags) { struct file *infp, *outfp; struct vnode *invp, *outvp; int error; size_t retlen; void *rl_rcookie, *rl_wcookie; off_t savinoff, savoutoff; infp = outfp = NULL; rl_rcookie = rl_wcookie = NULL; savinoff = -1; error = 0; retlen = 0; if (flags != 0) { error = EINVAL; goto out; } if (len > SSIZE_MAX) /* * Although the len argument is size_t, the return argument * is ssize_t (which is signed). Therefore a size that won't * fit in ssize_t can't be returned. */ len = SSIZE_MAX; /* Get the file structures for the file descriptors. */ error = fget_read(td, infd, &cap_read_rights, &infp); if (error != 0) goto out; if (infp->f_ops == &badfileops) { error = EBADF; goto out; } if (infp->f_vnode == NULL) { error = EINVAL; goto out; } error = fget_write(td, outfd, &cap_write_rights, &outfp); if (error != 0) goto out; if (outfp->f_ops == &badfileops) { error = EBADF; goto out; } if (outfp->f_vnode == NULL) { error = EINVAL; goto out; } /* Set the offset pointers to the correct place. */ if (inoffp == NULL) inoffp = &infp->f_offset; if (outoffp == NULL) outoffp = &outfp->f_offset; savinoff = *inoffp; savoutoff = *outoffp; invp = infp->f_vnode; outvp = outfp->f_vnode; /* Sanity check the f_flag bits. */ if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE || (infp->f_flag & FREAD) == 0) { error = EBADF; goto out; } /* If len == 0, just return 0. */ if (len == 0) goto out; /* * If infp and outfp refer to the same file, the byte ranges cannot * overlap. */ if (invp == outvp && ((savinoff <= savoutoff && savinoff + len > savoutoff) || (savinoff > savoutoff && savoutoff + len > savinoff))) { error = EINVAL; goto out; } /* Range lock the byte ranges for both invp and outvp. */ for (;;) { rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp + len); rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp + len); if (rl_rcookie != NULL) break; vn_rangelock_unlock(outvp, rl_wcookie); rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len); vn_rangelock_unlock(invp, rl_rcookie); } retlen = len; error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen, flags, infp->f_cred, outfp->f_cred, td); out: if (rl_rcookie != NULL) vn_rangelock_unlock(invp, rl_rcookie); if (rl_wcookie != NULL) vn_rangelock_unlock(outvp, rl_wcookie); if (savinoff != -1 && (error == EINTR || error == ERESTART)) { *inoffp = savinoff; *outoffp = savoutoff; } if (outfp != NULL) fdrop(outfp, td); if (infp != NULL) fdrop(infp, td); td->td_retval[0] = retlen; return (error); } int sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap) { off_t inoff, outoff, *inoffp, *outoffp; int error; inoffp = outoffp = NULL; if (uap->inoffp != NULL) { error = copyin(uap->inoffp, &inoff, sizeof(off_t)); if (error != 0) return (error); inoffp = &inoff; } if (uap->outoffp != NULL) { error = copyin(uap->outoffp, &outoff, sizeof(off_t)); if (error != 0) return (error); outoffp = &outoff; } error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd, outoffp, uap->len, uap->flags); if (error == 0 && uap->inoffp != NULL) error = copyout(inoffp, uap->inoffp, sizeof(off_t)); if (error == 0 && uap->outoffp != NULL) error = copyout(outoffp, uap->outoffp, sizeof(off_t)); return (error); } diff --git a/sys/sys/namei.h b/sys/sys/namei.h index f6f6546342c0..75e4051d8879 100644 --- a/sys/sys/namei.h +++ b/sys/sys/namei.h @@ -1,327 +1,317 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1985, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)namei.h 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_NAMEI_H_ #define _SYS_NAMEI_H_ #include #include #include #include #include enum nameiop { LOOKUP, CREATE, DELETE, RENAME }; struct componentname { /* * Arguments to lookup. */ u_int64_t cn_origflags; /* flags to namei */ u_int64_t cn_flags; /* flags to namei */ struct ucred *cn_cred; /* credentials */ enum nameiop cn_nameiop; /* namei operation */ int cn_lkflags; /* Lock flags LK_EXCLUSIVE or LK_SHARED */ /* * Shared between lookup and commit routines. */ char *cn_pnbuf; /* pathname buffer */ char *cn_nameptr; /* pointer to looked up name */ long cn_namelen; /* length of looked up component */ }; struct nameicap_tracker; TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker); /* * Encapsulation of namei parameters. */ struct nameidata { /* * Arguments to namei/lookup. */ const char *ni_dirp; /* pathname pointer */ enum uio_seg ni_segflg; /* location of pathname */ cap_rights_t *ni_rightsneeded; /* rights required to look up vnode */ /* * Arguments to lookup. */ struct vnode *ni_startdir; /* starting directory */ struct vnode *ni_rootdir; /* logical root directory */ struct vnode *ni_topdir; /* logical top directory */ int ni_dirfd; /* starting directory for *at functions */ int ni_lcf; /* local call flags */ /* * Results: returned from namei */ struct filecaps ni_filecaps; /* rights the *at base has */ /* * Results: returned from/manipulated by lookup */ struct vnode *ni_vp; /* vnode of result */ struct vnode *ni_dvp; /* vnode of intermediate directory */ /* * Results: flags returned from namei */ u_int ni_resflags; /* * Debug for validating API use by the callers. */ u_short ni_debugflags; /* * Shared between namei and lookup/commit routines. */ u_short ni_loopcnt; /* count of symlinks encountered */ size_t ni_pathlen; /* remaining chars in path */ char *ni_next; /* next location in pathname */ /* * Lookup parameters: this structure describes the subset of * information from the nameidata structure that is passed * through the VOP interface. */ struct componentname ni_cnd; struct nameicap_tracker_head ni_cap_tracker; /* * Private helper data for UFS, must be at the end. See * NDINIT_PREFILL(). */ seqc_t ni_dvp_seqc; seqc_t ni_vp_seqc; }; #ifdef _KERNEL enum cache_fpl_status { CACHE_FPL_STATUS_DESTROYED, CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL, CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET }; int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, struct pwd **pwdp); /* * Flags for namei. * * If modifying the list make sure to check whether NDVALIDATE needs updating. */ /* * Debug. */ #define NAMEI_DBG_INITED 0x0001 #define NAMEI_DBG_CALLED 0x0002 #define NAMEI_DBG_HADSTARTDIR 0x0004 /* * namei operational modifier flags, stored in ni_cnd.flags */ #define NC_NOMAKEENTRY 0x0001 /* name must not be added to cache */ #define NC_KEEPPOSENTRY 0x0002 /* don't evict a positive entry */ #define NOCACHE NC_NOMAKEENTRY /* for compatibility with older code */ #define LOCKLEAF 0x0004 /* lock vnode on return */ #define LOCKPARENT 0x0008 /* want parent vnode returned locked */ #define WANTPARENT 0x0010 /* want parent vnode returned unlocked */ #define FAILIFEXISTS 0x0020 /* return EEXIST if found */ #define FOLLOW 0x0040 /* follow symbolic links */ #define EMPTYPATH 0x0080 /* Allow empty path for *at */ #define LOCKSHARED 0x0100 /* Shared lock leaf */ #define NOFOLLOW 0x0000 /* do not follow symbolic links (pseudo) */ #define RBENEATH 0x100000000ULL /* No escape, even tmp, from start dir */ #define MODMASK 0xf000001ffULL /* mask of operational modifiers */ /* * Namei parameter descriptors. - * - * SAVENAME may be set by either the callers of namei or by VOP_LOOKUP. - * If the caller of namei sets the flag (for example execve wants to - * know the name of the program that is being executed), then it must - * free the buffer. If VOP_LOOKUP sets the flag, then the buffer must - * be freed by either the commit routine or the VOP_ABORT routine. - * SAVESTART is set only by the callers of namei. It implies SAVENAME - * plus the addition of saving the parent directory that contains the - * name in ni_startdir. It allows repeated calls to lookup for the - * name being sought. The caller is responsible for releasing the - * buffer and for vrele'ing ni_startdir. */ #define RDONLY 0x00000200 /* lookup with read-only semantics */ -#define SAVENAME 0x00000400 /* save pathname buffer */ +/* UNUSED 0x00000400 */ #define SAVESTART 0x00000800 /* save starting directory */ #define ISWHITEOUT 0x00001000 /* found whiteout */ #define DOWHITEOUT 0x00002000 /* do whiteouts */ #define WILLBEDIR 0x00004000 /* new files will be dirs; allow trailing / */ #define ISOPEN 0x00008000 /* caller is opening; return a real vnode. */ #define NOCROSSMOUNT 0x00010000 /* do not cross mount points */ #define NOMACCHECK 0x00020000 /* do not perform MAC checks */ #define AUDITVNODE1 0x00040000 /* audit the looked up vnode information */ #define AUDITVNODE2 0x00080000 /* audit the looked up vnode information */ #define NOCAPCHECK 0x00100000 /* do not perform capability checks */ #define OPENREAD 0x00200000 /* open for reading */ #define OPENWRITE 0x00400000 /* open for writing */ #define WANTIOCTLCAPS 0x00800000 /* leave ioctl caps for the caller */ -#define HASBUF 0x01000000 /* has allocated pathname buffer */ +/* UNUSED 0x01000000 */ #define NOEXECCHECK 0x02000000 /* do not perform exec check on dir */ #define MAKEENTRY 0x04000000 /* entry is to be added to name cache */ #define ISSYMLINK 0x08000000 /* symlink needs interpretation */ #define ISLASTCN 0x10000000 /* this is last component of pathname */ #define ISDOTDOT 0x20000000 /* current component name is .. */ #define TRAILINGSLASH 0x40000000 /* path ended in a slash */ #define PARAMASK 0x7ffffe00 /* mask of parameter descriptors */ /* * Flags which must not be passed in by callers. */ #define NAMEI_INTERNAL_FLAGS \ - (HASBUF | NOEXECCHECK | MAKEENTRY | ISSYMLINK | ISLASTCN | ISDOTDOT | \ + (NOEXECCHECK | MAKEENTRY | ISSYMLINK | ISLASTCN | ISDOTDOT | \ TRAILINGSLASH) /* * Namei results flags */ #define NIRES_ABS 0x00000001 /* Path was absolute */ #define NIRES_STRICTREL 0x00000002 /* Restricted lookup result */ #define NIRES_EMPTYPATH 0x00000004 /* EMPTYPATH used */ /* * Flags in ni_lcf, valid for the duration of the namei call. */ #define NI_LCF_STRICTRELATIVE 0x0001 /* relative lookup only */ #define NI_LCF_CAP_DOTDOT 0x0002 /* ".." in strictrelative case */ /* * Initialization of a nameidata structure. */ #define NDINIT(ndp, op, flags, segflg, namep) \ NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, NULL, &cap_no_rights) #define NDINIT_AT(ndp, op, flags, segflg, namep, dirfd) \ NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, &cap_no_rights) #define NDINIT_ATRIGHTS(ndp, op, flags, segflg, namep, dirfd, rightsp) \ NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, rightsp) #define NDINIT_ATVP(ndp, op, flags, segflg, namep, vp) \ NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, vp, &cap_no_rights) /* * Note the constant pattern may *hide* bugs. * Note also that we enable debug checks for non-TIED KLDs * so that they can run on an INVARIANTS kernel without tripping over * assertions on ni_debugflags state. */ #if defined(INVARIANTS) || (defined(KLD_MODULE) && !defined(KLD_TIED)) #define NDINIT_PREFILL(arg) memset(arg, 0xff, offsetof(struct nameidata, \ ni_dvp_seqc)) #define NDINIT_DBG(arg) { (arg)->ni_debugflags = NAMEI_DBG_INITED; } #define NDREINIT_DBG(arg) { \ if (((arg)->ni_debugflags & NAMEI_DBG_INITED) == 0) \ panic("namei data not inited"); \ if (((arg)->ni_debugflags & NAMEI_DBG_HADSTARTDIR) != 0) \ panic("NDREINIT on namei data with NAMEI_DBG_HADSTARTDIR"); \ (arg)->ni_debugflags = NAMEI_DBG_INITED; \ } #else #define NDINIT_PREFILL(arg) do { } while (0) #define NDINIT_DBG(arg) do { } while (0) #define NDREINIT_DBG(arg) do { } while (0) #endif #define NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, startdir, rightsp) \ do { \ struct nameidata *_ndp = (ndp); \ cap_rights_t *_rightsp = (rightsp); \ MPASS(_rightsp != NULL); \ NDINIT_PREFILL(_ndp); \ NDINIT_DBG(_ndp); \ _ndp->ni_cnd.cn_nameiop = op; \ _ndp->ni_cnd.cn_flags = flags; \ _ndp->ni_segflg = segflg; \ _ndp->ni_dirp = namep; \ _ndp->ni_dirfd = dirfd; \ _ndp->ni_startdir = startdir; \ _ndp->ni_resflags = 0; \ filecaps_init(&_ndp->ni_filecaps); \ _ndp->ni_rightsneeded = _rightsp; \ } while (0) #define NDREINIT(ndp) do { \ struct nameidata *_ndp = (ndp); \ NDREINIT_DBG(_ndp); \ filecaps_free(&_ndp->ni_filecaps); \ _ndp->ni_resflags = 0; \ _ndp->ni_startdir = NULL; \ } while (0) #define NDPREINIT(ndp) do { \ (ndp)->ni_dvp_seqc = SEQC_MOD; \ (ndp)->ni_vp_seqc = SEQC_MOD; \ } while (0) #define NDF_NO_DVP_RELE 0x00000001 #define NDF_NO_DVP_UNLOCK 0x00000002 #define NDF_NO_DVP_PUT 0x00000003 #define NDF_NO_VP_RELE 0x00000004 #define NDF_NO_VP_UNLOCK 0x00000008 #define NDF_NO_VP_PUT 0x0000000c #define NDF_NO_STARTDIR_RELE 0x00000010 #define NDF_NO_FREE_PNBUF 0x00000020 #define NDFREE_IOCTLCAPS(ndp) do { \ struct nameidata *_ndp = (ndp); \ filecaps_free(&_ndp->ni_filecaps); \ } while (0) -void NDFREE_PNBUF(struct nameidata *); -void NDFREE(struct nameidata *, const u_int); -#ifdef INVARIANTS -void NDFREE_NOTHING(struct nameidata *); -#else -#define NDFREE_NOTHING(ndp) do { } while (0) -#endif +#define NDFREE_PNBUF(ndp) do { \ + struct nameidata *_ndp = (ndp); \ + MPASS(_ndp->ni_cnd.cn_pnbuf != NULL); \ + uma_zfree(namei_zone, _ndp->ni_cnd.cn_pnbuf); \ + _ndp->ni_cnd.cn_pnbuf = NULL; \ +} while (0) + +void NDFREE(struct nameidata *, const u_int); int namei(struct nameidata *ndp); int vfs_lookup(struct nameidata *ndp); int vfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); #endif /* * Stats on usefulness of namei caches. */ struct nchstats { long ncs_goodhits; /* hits that we can really use */ long ncs_neghits; /* negative hits that we can use */ long ncs_badhits; /* hits we must drop */ long ncs_falsehits; /* hits with id mismatch */ long ncs_miss; /* misses */ long ncs_long; /* long names that ignore cache */ long ncs_pass2; /* names found with passes == 2 */ long ncs_2passes; /* number of times we attempt it */ }; extern struct nchstats nchstats; #endif /* !_SYS_NAMEI_H_ */ diff --git a/sys/sys/param.h b/sys/sys/param.h index 2d22202b5fbd..e4ab2b31fe46 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -1,390 +1,390 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * documentation/content/en/books/porters-handbook/versions/_index.adoc * * Encoding: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. * Short hand: MMmmXXX * * __FreeBSD_version is bumped every time there's a change in the base system * that's noteworthy. A noteworthy change is any change which changes the * kernel's KBI in -CURRENT, one that changes some detail about the system that * external software (or the ports system) would want to know about, one that * adds a system call, one that adds or deletes a shipped library, a security * fix, or similar change not specifically noted here. Bumps should be limited * to one per day / a couple per week except for security fixes. * * The approved way to obtain this from a shell script is: * awk '/^\#define[[:space:]]*__FreeBSD_version/ {print $3}' * Other methods to parse this file may work, but are not guaranteed against * future changes. The above script works back to FreeBSD 3.x when this macro * was introduced. This number is propagated to other places needing it that * cannot include sys/param.h and should only be updated here. */ #undef __FreeBSD_version -#define __FreeBSD_version 1400067 +#define __FreeBSD_version 1400068 /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_CK_CYLGRP 1200046 #define P_OSREL_VMTOTAL64 1200054 #define P_OSREL_CK_SUPERBLOCK 1300000 #define P_OSREL_CK_INODE 1300005 #define P_OSREL_POWERPC_NEW_AUX_ARGS 1300070 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 255 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL #ifndef LOCORE /* Signals. */ #include #endif #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define PNOLOCK 0x400 /* OR'd with pri to allow sleeping w/o a lock */ #define PRILASTFLAG 0x400 /* Last flag defined above */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) __align_down(x, y) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) __align_up(x, y) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). Since the intermediate * calculation is done with 64-bit precision, the maximum load average that can * be calculated is approximately 2^32 / FSCALE. * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11. This gives a maximum load avg of 2 million. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c index 9c6e95687f85..4c390f4c42ef 100644 --- a/sys/ufs/ufs/ufs_lookup.c +++ b/sys/ufs/ufs/ufs_lookup.c @@ -1,1585 +1,1579 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_lookup.c 8.15 (Berkeley) 6/16/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ufs.h" #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #include #include #include #ifdef DIAGNOSTIC static int dirchk = 1; #else static int dirchk = 0; #endif SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, ""); static int ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred, struct thread *td) { int error; #ifdef UFS_ACL /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.2.1. ACE4_DELETE vs. ACE4_DELETE_CHILD */ /* * XXX: Is this check required? */ error = VOP_ACCESS(vdp, VEXEC, cred, td); if (error) return (error); error = VOP_ACCESSX(tdp, VDELETE, cred, td); if (error == 0) return (0); error = VOP_ACCESSX(vdp, VDELETE_CHILD, cred, td); if (error == 0) return (0); error = VOP_ACCESSX(vdp, VEXPLICIT_DENY | VDELETE_CHILD, cred, td); if (error) return (error); #endif /* !UFS_ACL */ /* * Standard Unix access control - delete access requires VWRITE. */ error = VOP_ACCESS(vdp, VWRITE, cred, td); if (error) return (error); /* * If directory is "sticky", then user must own * the directory, or the file in it, else she * may not delete it (unless she's root). This * implements append-only directories. */ if ((VTOI(vdp)->i_mode & ISVTX) && VOP_ACCESS(vdp, VADMIN, cred, td) && VOP_ACCESS(tdp, VADMIN, cred, td)) return (EPERM); return (0); } /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. * If the filesystem is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending * on whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and vput * instead of two vputs. * * This routine is actually used as VOP_CACHEDLOOKUP method, and the * filesystem employs the generic vfs_cache_lookup() as VOP_LOOKUP * method. * * vfs_cache_lookup() performs the following for us: * check that it is a directory * check accessibility of directory * check for modification attempts on read-only mounts * if name found in cache * if at end of path and deleting or creating * drop it * else * return name. * return VOP_CACHEDLOOKUP() * * Overall outline of ufs_lookup: * * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, leaving info on available slots * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * inode and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache */ int ufs_lookup( struct vop_cachedlookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap) { return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); } int ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino) { struct inode *dp; /* inode for directory being searched */ struct buf *bp; /* a buffer of directory entries */ struct direct *ep; /* the current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ enum {NONE, COMPACT, FOUND} slotstatus; doff_t slotoffset; /* offset of area with free space */ doff_t i_diroff; /* cached i_diroff value. */ doff_t i_offset; /* cached i_offset value. */ int slotsize; /* size of area at slotoffset */ int slotfreespace; /* amount of space free in slot */ int slotneeded; /* size of the entry we're seeking */ int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ doff_t prevoff; /* prev entry dp->i_offset */ struct vnode *pdp; /* saved dp during symlink work */ struct vnode *tdp; /* returned by VFS_VGET */ doff_t enduseful; /* pointer past last used dir slot */ u_long bmask; /* block offset mask */ int namlen, error; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; ino_t ino, ino1; int ltype; if (vpp != NULL) *vpp = NULL; dp = VTOI(vdp); if (dp->i_effnlink == 0) return (ENOENT); /* * Create a vm object if vmiodirenable is enabled. * Alternatively we could call vnode_create_vobject * in VFS_VGET but we could end up creating objects * that are never used. */ vnode_create_vobject(vdp, DIP(dp, i_size), curthread); bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; #ifdef DEBUG_VFS_LOCKS /* * Assert that the directory vnode is locked, and locked * exclusively for the last component lookup for modifying * operations. * * The directory-modifying operations need to save * intermediate state in the inode between namei() call and * actual directory manipulations. See fields in the struct * inode marked as 'used during directory lookup'. We must * ensure that upgrade in namei() does not happen, since * upgrade might need to unlock vdp. If quotas are enabled, * getinoquota() also requires exclusive lock to modify inode. */ ASSERT_VOP_LOCKED(vdp, "ufs_lookup1"); if ((nameiop == CREATE || nameiop == DELETE || nameiop == RENAME) && (flags & (LOCKPARENT | ISLASTCN)) == (LOCKPARENT | ISLASTCN)) ASSERT_VOP_ELOCKED(vdp, "ufs_lookup2"); #endif restart: bp = NULL; slotoffset = -1; /* * We now have a segment name to search for, and a directory to search. * * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. */ ino = 0; i_diroff = dp->i_diroff; slotstatus = FOUND; slotfreespace = slotsize = slotneeded = 0; if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) { slotstatus = NONE; slotneeded = DIRECTSIZ(cnp->cn_namelen); } #ifdef UFS_DIRHASH /* * Use dirhash for fast operations on large directories. The logic * to determine whether to hash the directory is contained within * ufsdirhash_build(); a zero return means that it decided to hash * this directory and it successfully built up the hash table. */ if (ufsdirhash_build(dp) == 0) { /* Look for a free slot if needed. */ enduseful = dp->i_size; if (slotstatus != FOUND) { slotoffset = ufsdirhash_findfree(dp, slotneeded, &slotsize); if (slotoffset >= 0) { slotstatus = COMPACT; enduseful = ufsdirhash_enduseful(dp); if (enduseful < 0) enduseful = dp->i_size; } } /* Look up the component. */ numdirpasses = 1; entryoffsetinblock = 0; /* silence compiler warning */ switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, &i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) { case 0: ep = (struct direct *)((char *)bp->b_data + (i_offset & bmask)); goto foundentry; case ENOENT: i_offset = roundup2(dp->i_size, DIRBLKSIZ); goto notfound; default: /* Something failed; just do a linear search. */ break; } } #endif /* UFS_DIRHASH */ /* * If there is cached information on a previous search of * this directory, pick up where we last left off. * We cache only lookups as these are the most common * and have the greatest payoff. Caching CREATE has little * benefit as it usually must search the entire directory * to determine that the entry does not exist. Caching the * location of the last DELETE or RENAME has not reduced * profiling time and hence has been removed in the interest * of simplicity. */ if (nameiop != LOOKUP || i_diroff == 0 || i_diroff >= dp->i_size) { entryoffsetinblock = 0; i_offset = 0; numdirpasses = 1; } else { i_offset = i_diroff; if ((entryoffsetinblock = i_offset & bmask) && (error = UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp))) return (error); numdirpasses = 2; nchstats.ncs_2passes++; } prevoff = i_offset; endsearch = roundup2(dp->i_size, DIRBLKSIZ); enduseful = 0; searchloop: while (i_offset < endsearch) { /* * If necessary, get the next directory block. */ if ((i_offset & bmask) == 0) { if (bp != NULL) brelse(bp); error = UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp); if (error) return (error); entryoffsetinblock = 0; } /* * If still looking for a slot, and at a DIRBLKSIZE * boundary, have to start looking for free space again. */ if (slotstatus == NONE && (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) { slotoffset = -1; slotfreespace = 0; } /* * Get pointer to next entry. * Full validation checks are slow, so we only check * enough to insure forward progress through the * directory. Complete checks can be run by patching * "dirchk" to be true. */ ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); if (ep->d_reclen == 0 || ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) { int i; ufs_dirbad(dp, i_offset, "mangled entry"); i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); i_offset += i; entryoffsetinblock += i; continue; } /* * If an appropriate sized slot has not yet been found, * check to see if one is available. Also accumulate space * in the current block so that we can determine if * compaction is viable. */ if (slotstatus != FOUND) { int size = ep->d_reclen; if (ep->d_ino != 0) size -= DIRSIZ(OFSFMT(vdp), ep); if (size > 0) { if (size >= slotneeded) { slotstatus = FOUND; slotoffset = i_offset; slotsize = ep->d_reclen; } else if (slotstatus == NONE) { slotfreespace += size; if (slotoffset == -1) slotoffset = i_offset; if (slotfreespace >= slotneeded) { slotstatus = COMPACT; slotsize = i_offset + ep->d_reclen - slotoffset; } } } } /* * Check for a name match. */ if (ep->d_ino) { # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(vdp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if (namlen == cnp->cn_namelen && (cnp->cn_nameptr[0] == ep->d_name[0]) && !bcmp(cnp->cn_nameptr, ep->d_name, (unsigned)namlen)) { #ifdef UFS_DIRHASH foundentry: #endif /* * Save directory entry's inode number and * reclen in ndp->ni_ufs area, and release * directory buffer. */ if (!OFSFMT(vdp) && ep->d_type == DT_WHT) { slotstatus = FOUND; slotoffset = i_offset; slotsize = ep->d_reclen; enduseful = dp->i_size; cnp->cn_flags |= ISWHITEOUT; numdirpasses--; goto notfound; } ino = ep->d_ino; goto found; } } prevoff = i_offset; i_offset += ep->d_reclen; entryoffsetinblock += ep->d_reclen; if (ep->d_ino) enduseful = i_offset; } notfound: /* * If we started in the middle of the directory and failed * to find our target, we must check the beginning as well. */ if (numdirpasses == 2) { numdirpasses--; i_offset = 0; endsearch = i_diroff; goto searchloop; } if (bp != NULL) brelse(bp); /* * If creating, and at end of pathname and current * directory has not been removed, then can consider * allowing file to be created. */ if ((nameiop == CREATE || nameiop == RENAME || (nameiop == DELETE && (cnp->cn_flags & DOWHITEOUT) && (cnp->cn_flags & ISWHITEOUT))) && (flags & ISLASTCN) && dp->i_effnlink != 0) { /* * Access for write is interpreted as allowing * creation of files in the directory. * * XXX: Fix the comment above. */ if (flags & WILLBEDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, curthread); else error = VOP_ACCESS(vdp, VWRITE, cred, curthread); if (error) return (error); /* * Return an indication of where the new directory * entry should be put. If we didn't find a slot, * then set dp->i_count to 0 indicating * that the new slot belongs at the end of the * directory. If we found a slot, then the new entry * can be put in the range from dp->i_offset to * dp->i_offset + dp->i_count. */ if (slotstatus == NONE) { SET_I_OFFSET(dp, roundup2(dp->i_size, DIRBLKSIZ)); SET_I_COUNT(dp, 0); enduseful = I_OFFSET(dp); } else if (nameiop == DELETE) { SET_I_OFFSET(dp, slotoffset); if ((I_OFFSET(dp) & (DIRBLKSIZ - 1)) == 0) SET_I_COUNT(dp, 0); else SET_I_COUNT(dp, I_OFFSET(dp) - prevoff); } else { SET_I_OFFSET(dp, slotoffset); SET_I_COUNT(dp, slotsize); if (enduseful < slotoffset + slotsize) enduseful = slotoffset + slotsize; } SET_I_ENDOFF(dp, roundup2(enduseful, DIRBLKSIZ)); /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. - * The pathname buffer is saved so that the name - * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ - cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } /* * Insert name into cache (as non-existent) if appropriate. */ if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(vdp, NULL, cnp); return (ENOENT); found: if (dd_ino != NULL) *dd_ino = ino; if (numdirpasses == 2) nchstats.ncs_pass2++; /* * Check that directory length properly reflects presence * of this entry. */ if (i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) { ufs_dirbad(dp, i_offset, "i_size too small"); dp->i_size = i_offset + DIRSIZ(OFSFMT(vdp), ep); DIP_SET(dp, i_size, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); } brelse(bp); /* * Found component in pathname. * If the final component of path name, save information * in the cache as to where the entry was found. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = rounddown2(i_offset, DIRBLKSIZ); /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. */ if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); if (VOP_ISLOCKED(vdp) == LK_EXCLUSIVE) { /* * Return pointer to current entry in * dp->i_offset, and distance past previous * entry (if there is a previous entry in this * block) in dp->i_count. * * We shouldn't be setting these in the * WANTPARENT case (first lookup in rename()), but any * lookups that will result in directory changes will * overwrite these. */ SET_I_OFFSET(dp, i_offset); if ((I_OFFSET(dp) & (DIRBLKSIZ - 1)) == 0) SET_I_COUNT(dp, 0); else SET_I_COUNT(dp, I_OFFSET(dp) - prevoff); } if (dd_ino != NULL) return (0); /* * Save directory inode pointer in ndp->ni_dvp for * dirremove(). */ if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); error = ufs_delete_denied(vdp, tdp, cred, curthread); if (error) { vput(tdp); return (error); } if (dp->i_number == ino) { VREF(vdp); *vpp = vdp; vput(tdp); return (0); } *vpp = tdp; return (0); } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && (flags & ISLASTCN)) { if (flags & WILLBEDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, curthread); else error = VOP_ACCESS(vdp, VWRITE, cred, curthread); if (error) return (error); /* * Careful about locking second inode. * This can only occur if the target is ".". */ SET_I_OFFSET(dp, i_offset); if (dp->i_number == ino) return (EISDIR); if (dd_ino != NULL) return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); error = ufs_delete_denied(vdp, tdp, cred, curthread); if (error) { vput(tdp); return (error); } #ifdef SunOS_doesnt_do_that /* * The only purpose of this check is to return the correct * error. Assume that we want to rename directory "a" * to a file "b", and that we have no ACL_WRITE_DATA on * a containing directory, but we _do_ have ACL_APPEND_DATA. * In that case, the VOP_ACCESS check above will return 0, * and the operation will fail with ENOTDIR instead * of EACCESS. */ if (tdp->v_type == VDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, curthread); else error = VOP_ACCESS(vdp, VWRITE, cred, curthread); if (error) { vput(tdp); return (error); } #endif *vpp = tdp; - cnp->cn_flags |= SAVENAME; return (0); } if (dd_ino != NULL) return (0); /* * Step through the translation in the name. We do not `vput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the VFS_VGET for the * inode associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the filesystem has any hard links other than ".." * that point backwards in the directory structure. */ pdp = vdp; if (flags & ISDOTDOT) { error = vn_vget_ino(pdp, ino, cnp->cn_lkflags, &tdp); if (error) return (error); /* * Recheck that ".." entry in the vdp directory points * to the inode we looked up before vdp lock was * dropped. */ error = ufs_lookup_ino(pdp, NULL, cnp, &ino1); if (error) { vput(tdp); return (error); } if (ino1 != ino) { vput(tdp); goto restart; } *vpp = tdp; } else if (dp->i_number == ino) { VREF(vdp); /* we want ourself, ie "." */ /* * When we lookup "." we still can be asked to lock it * differently. */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(vdp)) { if (ltype == LK_EXCLUSIVE) vn_lock(vdp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(vdp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case may left us with * reclaimed vnode. */ if (VN_IS_DOOMED(vdp)) { vrele(vdp); return (ENOENT); } } *vpp = vdp; } else { error = VFS_VGET(pdp->v_mount, ino, cnp->cn_lkflags, &tdp); if (error == 0 && VTOI(tdp)->i_mode == 0) { vgone(tdp); vput(tdp); error = ENOENT; } if (error) return (error); *vpp = tdp; } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } void ufs_dirbad(struct inode *ip, doff_t offset, char *how) { struct mount *mp; mp = ITOV(ip)->v_mount; if ((mp->mnt_flag & MNT_RDONLY) == 0) panic("ufs_dirbad: %s: bad dir ino %ju at offset %ld: %s", mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, (long)offset, how); else (void)printf("%s: bad dir ino %ju at offset %ld: %s\n", mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, (long)offset, how); } /* * Do consistency checking on a directory entry: * record length must be multiple of 4 * entry must fit in rest of its DIRBLKSIZ block * record must be large enough to contain entry * name is not longer than UFS_MAXNAMLEN * name must be as long as advertised, and null terminated */ int ufs_dirbadentry(struct vnode *dp, struct direct *ep, int entryoffsetinblock) { int i, namlen; # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(dp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if ((ep->d_reclen & 0x3) != 0 || ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || ep->d_reclen < DIRSIZ(OFSFMT(dp), ep) || namlen > UFS_MAXNAMLEN) { /*return (1); */ printf("First bad\n"); goto bad; } if (ep->d_ino == 0) return (0); for (i = 0; i < namlen; i++) if (ep->d_name[i] == '\0') { /*return (1); */ printf("Second bad\n"); goto bad; } if (ep->d_name[i]) goto bad; return (0); bad: return (1); } /* * Construct a new directory entry after a call to namei, using the * parameters that it left in the componentname argument cnp. The * argument ip is the inode to which the new directory entry will refer. */ void ufs_makedirentry(struct inode *ip, struct componentname *cnp, struct direct *newdirp) { u_int namelen; namelen = (unsigned)cnp->cn_namelen; - KASSERT((cnp->cn_flags & SAVENAME) != 0, - ("ufs_makedirentry: missing name")); KASSERT(namelen <= UFS_MAXNAMLEN, ("ufs_makedirentry: name too long")); newdirp->d_ino = ip->i_number; newdirp->d_namlen = namelen; /* Zero out after-name padding */ *(u_int32_t *)(&newdirp->d_name[namelen & ~(DIR_ROUNDUP - 1)]) = 0; bcopy(cnp->cn_nameptr, newdirp->d_name, namelen); if (!OFSFMT(ITOV(ip))) newdirp->d_type = IFTODT(ip->i_mode); else { newdirp->d_type = 0; # if (BYTE_ORDER == LITTLE_ENDIAN) { u_char tmp = newdirp->d_namlen; newdirp->d_namlen = newdirp->d_type; newdirp->d_type = tmp; } # endif } } /* * Write a directory entry after a call to namei, using the parameters * that it left in nameidata. The argument dirp is the new directory * entry contents. Dvp is a pointer to the directory to be written, * which was left locked by namei. Remaining parameters (dp->i_offset, * dp->i_count) indicate how the space for the new entry is to be obtained. * Non-null bp indicates that a directory is being created (for the * soft dependency code). */ int ufs_direnter(struct vnode *dvp, struct vnode *tvp, struct direct *dirp, struct componentname *cnp, struct buf *newdirbp) { struct ucred *cr; struct thread *td; int newentrysize; struct inode *dp; struct buf *bp; u_int dsize; struct direct *ep, *nep; u_int64_t old_isize; int error, ret, blkoff, loc, spacefree, flags, namlen; char *dirbuf; td = curthread; /* XXX */ cr = td->td_ucred; dp = VTOI(dvp); newentrysize = DIRSIZ(OFSFMT(dvp), dirp); if (I_COUNT(dp) == 0) { /* * If dp->i_count is 0, then namei could find no * space in the directory. Here, dp->i_offset will * be on a directory block boundary and we will write the * new entry into a fresh block. */ if (I_OFFSET(dp) & (DIRBLKSIZ - 1)) panic("ufs_direnter: newblk"); flags = BA_CLRBUF; if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)) flags |= IO_SYNC; #ifdef QUOTA if ((error = getinoquota(dp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); return (error); } #endif old_isize = dp->i_size; vnode_pager_setsize(dvp, (u_long)I_OFFSET(dp) + DIRBLKSIZ); if ((error = UFS_BALLOC(dvp, (off_t)I_OFFSET(dp), DIRBLKSIZ, cr, flags, &bp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); vnode_pager_setsize(dvp, (u_long)old_isize); return (error); } dp->i_size = I_OFFSET(dp) + DIRBLKSIZ; DIP_SET(dp, i_size, dp->i_size); SET_I_ENDOFF(dp, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); dirp->d_reclen = DIRBLKSIZ; blkoff = I_OFFSET(dp) & (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1); bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) { ufsdirhash_newblk(dp, I_OFFSET(dp)); ufsdirhash_add(dp, dirp, I_OFFSET(dp)); ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, I_OFFSET(dp)); } #endif if (DOINGSOFTDEP(dvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff += DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } if (softdep_setup_directory_add(bp, dp, I_OFFSET(dp), dirp->d_ino, newdirbp, 1)) UFS_INODE_SET_FLAG(dp, IN_NEEDSYNC); if (newdirbp) bdwrite(newdirbp); bdwrite(bp); return (UFS_UPDATE(dvp, 0)); } if (DOINGASYNC(dvp)) { bdwrite(bp); return (UFS_UPDATE(dvp, 0)); } error = bwrite(bp); ret = UFS_UPDATE(dvp, 1); if (error == 0) return (ret); return (error); } /* * If dp->i_count is non-zero, then namei found space for the new * entry in the range dp->i_offset to dp->i_offset + dp->i_count * in the directory. To use this space, we may have to compact * the entries located there, by copying them together towards the * beginning of the block, leaving the free space in one usable * chunk at the end. */ /* * Increase size of directory if entry eats into new space. * This should never push the size past a new multiple of * DIRBLKSIZE. * * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. */ if (I_OFFSET(dp) + I_COUNT(dp) > dp->i_size) { dp->i_size = I_OFFSET(dp) + I_COUNT(dp); DIP_SET(dp, i_size, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_MODIFIED); } /* * Get the block containing the space for the new directory entry. */ error = UFS_BLKATOFF(dvp, (off_t)I_OFFSET(dp), &dirbuf, &bp); if (error) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); return (error); } /* * Find space for the new entry. In the simple case, the entry at * offset base will have the space. If it does not, then namei * arranged that compacting the region dp->i_offset to * dp->i_offset + dp->i_count would yield the space. */ ep = (struct direct *)dirbuf; dsize = ep->d_ino ? DIRSIZ(OFSFMT(dvp), ep) : 0; spacefree = ep->d_reclen - dsize; for (loc = ep->d_reclen; loc < I_COUNT(dp); ) { nep = (struct direct *)(dirbuf + loc); /* Trim the existing slot (NB: dsize may be zero). */ ep->d_reclen = dsize; ep = (struct direct *)((char *)ep + dsize); /* Read nep->d_reclen now as the bcopy() may clobber it. */ loc += nep->d_reclen; if (nep->d_ino == 0) { /* * A mid-block unused entry. Such entries are * never created by the kernel, but fsck_ffs * can create them (and it doesn't fix them). * * Add up the free space, and initialise the * relocated entry since we don't bcopy it. */ spacefree += nep->d_reclen; ep->d_ino = 0; dsize = 0; continue; } dsize = DIRSIZ(OFSFMT(dvp), nep); spacefree += nep->d_reclen - dsize; #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_move(dp, nep, I_OFFSET(dp) + ((char *)nep - dirbuf), I_OFFSET(dp) + ((char *)ep - dirbuf)); #endif if (DOINGSOFTDEP(dvp)) softdep_change_directoryentry_offset(bp, dp, dirbuf, (caddr_t)nep, (caddr_t)ep, dsize); else bcopy((caddr_t)nep, (caddr_t)ep, dsize); } /* * Here, `ep' points to a directory entry containing `dsize' in-use * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0, * then the entry is completely unused (dsize == 0). The value * of ep->d_reclen is always indeterminate. * * Update the pointer fields in the previous entry (if any), * copy in the new entry, and write out the block. */ # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(dvp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if (ep->d_ino == 0 || (ep->d_ino == UFS_WINO && namlen == dirp->d_namlen && bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) { if (spacefree + dsize < newentrysize) panic("ufs_direnter: compact1"); dirp->d_reclen = spacefree + dsize; } else { if (spacefree < newentrysize) panic("ufs_direnter: compact2"); dirp->d_reclen = spacefree; ep->d_reclen = dsize; ep = (struct direct *)((char *)ep + dsize); } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL && (ep->d_ino == 0 || dirp->d_reclen == spacefree)) ufsdirhash_add(dp, dirp, I_OFFSET(dp) + ((char *)ep - dirbuf)); #endif bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, dirbuf - (I_OFFSET(dp) & (DIRBLKSIZ - 1)), rounddown2(I_OFFSET(dp), DIRBLKSIZ)); #endif if (DOINGSOFTDEP(dvp)) { (void) softdep_setup_directory_add(bp, dp, I_OFFSET(dp) + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); if (newdirbp != NULL) bdwrite(newdirbp); bdwrite(bp); } else { if (DOINGASYNC(dvp)) { bdwrite(bp); error = 0; } else { error = bwrite(bp); } } /* * If all went well, and the directory can be shortened, * mark directory inode with the truncation request. */ UFS_INODE_SET_FLAG(dp, IN_CHANGE | IN_UPDATE | (error == 0 && I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size ? IN_ENDOFF : 0)); return (error); } /* * Remove a directory entry after a call to namei, using * the parameters which it left in nameidata. The entry * dp->i_offset contains the offset into the directory of the * entry to be eliminated. The dp->i_count field contains the * size of the previous record in the directory. If this * is 0, the first entry is being deleted, so we need only * zero the inode number to mark the entry as free. If the * entry is not the first in the directory, we must reclaim * the space of the now empty record by adding the record size * to the size of the previous entry. */ int ufs_dirremove(struct vnode *dvp, struct inode *ip, int flags, int isrmdir) { struct inode *dp; struct direct *ep, *rep; struct buf *bp; off_t offset; int error; dp = VTOI(dvp); /* * Adjust the link count early so softdep can block if necessary. */ if (ip) { ip->i_effnlink--; UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(dvp)) { softdep_setup_unlink(dp, ip); } else { ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); } } if (flags & DOWHITEOUT) offset = I_OFFSET(dp); else offset = I_OFFSET(dp) - I_COUNT(dp); if ((error = UFS_BLKATOFF(dvp, offset, (char **)&ep, &bp)) != 0) { if (ip) { ip->i_effnlink++; UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(dvp)) { softdep_change_linkcnt(ip); } else { ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); } } return (error); } if (flags & DOWHITEOUT) { /* * Whiteout entry: set d_ino to UFS_WINO. */ ep->d_ino = UFS_WINO; ep->d_type = DT_WHT; goto out; } /* Set 'rep' to the entry being removed. */ if (I_COUNT(dp) == 0) rep = ep; else rep = (struct direct *)((char *)ep + ep->d_reclen); #ifdef UFS_DIRHASH /* * Remove the dirhash entry. This is complicated by the fact * that `ep' is the previous entry when dp->i_count != 0. */ if (dp->i_dirhash != NULL) ufsdirhash_remove(dp, rep, I_OFFSET(dp)); #endif if (ip && rep->d_ino != ip->i_number) panic("ufs_dirremove: ip %ju does not match dirent ino %ju\n", (uintmax_t)ip->i_number, (uintmax_t)rep->d_ino); /* * Zero out the file directory entry metadata to reduce disk * scavenging disclosure. */ bzero(&rep->d_name[0], rep->d_namlen); rep->d_namlen = 0; rep->d_type = 0; rep->d_ino = 0; if (I_COUNT(dp) != 0) { /* * Collapse new free space into previous entry. */ ep->d_reclen += rep->d_reclen; rep->d_reclen = 0; } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, (char *)ep - ((I_OFFSET(dp) - I_COUNT(dp)) & (DIRBLKSIZ - 1)), rounddown2(I_OFFSET(dp), DIRBLKSIZ)); #endif out: error = 0; if (DOINGSOFTDEP(dvp)) { if (ip) softdep_setup_remove(bp, dp, ip, isrmdir); if (softdep_slowdown(dvp)) error = bwrite(bp); else bdwrite(bp); } else { if (flags & DOWHITEOUT) error = bwrite(bp); else if (DOINGASYNC(dvp)) bdwrite(bp); else error = bwrite(bp); } UFS_INODE_SET_FLAG(dp, IN_CHANGE | IN_UPDATE); /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when last open reference goes away. */ if (ip != NULL && IS_SNAPSHOT(ip) && ip->i_effnlink == 0) UFS_SNAPGONE(ip); return (error); } /* * Rewrite an existing directory entry to point at the inode * supplied. The parameters describing the directory entry are * set up by a call to namei. */ int ufs_dirrewrite(struct inode *dp, struct inode *oip, ino_t newinum, int newtype, int isrmdir) { struct buf *bp; struct direct *ep; struct vnode *vdp = ITOV(dp); int error; /* * Drop the link before we lock the buf so softdep can block if * necessary. */ oip->i_effnlink--; UFS_INODE_SET_FLAG(oip, IN_CHANGE); if (DOINGSOFTDEP(vdp)) { softdep_setup_unlink(dp, oip); } else { oip->i_nlink--; DIP_SET(oip, i_nlink, oip->i_nlink); UFS_INODE_SET_FLAG(oip, IN_CHANGE); } error = UFS_BLKATOFF(vdp, (off_t)I_OFFSET(dp), (char **)&ep, &bp); if (error == 0 && ep->d_namlen == 2 && ep->d_name[1] == '.' && ep->d_name[0] == '.' && ep->d_ino != oip->i_number) { brelse(bp); error = EIDRM; } if (error) { oip->i_effnlink++; UFS_INODE_SET_FLAG(oip, IN_CHANGE); if (DOINGSOFTDEP(vdp)) { softdep_change_linkcnt(oip); } else { oip->i_nlink++; DIP_SET(oip, i_nlink, oip->i_nlink); UFS_INODE_SET_FLAG(oip, IN_CHANGE); } return (error); } ep->d_ino = newinum; if (!OFSFMT(vdp)) ep->d_type = newtype; if (DOINGSOFTDEP(vdp)) { softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); bdwrite(bp); } else { if (DOINGASYNC(vdp)) { bdwrite(bp); error = 0; } else { error = bwrite(bp); } } UFS_INODE_SET_FLAG(dp, IN_CHANGE | IN_UPDATE); /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when last open reference goes away. */ if (IS_SNAPSHOT(oip) && oip->i_effnlink == 0) UFS_SNAPGONE(oip); return (error); } /* * Check if a directory is empty or not. * Inode supplied must be locked. * * Using a struct dirtemplate here is not precisely * what we want, but better than using a struct direct. * * NB: does not handle corrupted directories. */ int ufs_dirempty(struct inode *ip, ino_t parentino, struct ucred *cred) { doff_t off; struct dirtemplate dbuf; struct direct *dp = (struct direct *)&dbuf; int error, namlen; ssize_t count; #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) for (off = 0; off < ip->i_size; off += dp->d_reclen) { error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, &count, (struct thread *)0); /* * Since we read MINDIRSIZ, residual must * be 0 unless we're at end of file. */ if (error || count != 0) return (0); /* avoid infinite loops */ if (dp->d_reclen == 0) return (0); /* skip empty entries */ if (dp->d_ino == 0 || dp->d_ino == UFS_WINO) continue; /* accept only "." and ".." */ # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(ITOV(ip))) namlen = dp->d_type; else namlen = dp->d_namlen; # else namlen = dp->d_namlen; # endif if (namlen > 2) return (0); if (dp->d_name[0] != '.') return (0); /* * At this point namlen must be 1 or 2. * 1 implies ".", 2 implies ".." if second * char is also "." */ if (namlen == 1 && dp->d_ino == ip->i_number) continue; if (dp->d_name[1] == '.' && dp->d_ino == parentino) continue; return (0); } return (1); } static int ufs_dir_dd_ino(struct vnode *vp, struct ucred *cred, ino_t *dd_ino, struct vnode **dd_vp) { struct dirtemplate dirbuf; struct vnode *ddvp; int error, namlen; ASSERT_VOP_LOCKED(vp, "ufs_dir_dd_ino"); *dd_vp = NULL; if (vp->v_type != VDIR) return (ENOTDIR); /* * First check to see if we have it in the name cache. */ if ((ddvp = vn_dir_dd_ino(vp)) != NULL) { KASSERT(ddvp->v_mount == vp->v_mount, ("ufs_dir_dd_ino: Unexpected mount point crossing")); *dd_ino = VTOI(ddvp)->i_number; *dd_vp = ddvp; return (0); } /* * Have to read the directory. */ error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, NULL, NULL); if (error != 0) return (error); #if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(vp)) namlen = dirbuf.dotdot_type; else namlen = dirbuf.dotdot_namlen; #else namlen = dirbuf.dotdot_namlen; #endif if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') return (ENOTDIR); *dd_ino = dirbuf.dotdot_ino; return (0); } /* * Check if source directory is in the path of the target directory. */ int ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino) { struct mount *mp; struct vnode *tvp, *vp, *vp1; int error; ino_t dd_ino; vp = tvp = ITOV(target); mp = vp->v_mount; *wait_ino = 0; sx_assert(&VFSTOUFS(mp)->um_checkpath_lock, SA_XLOCKED); if (target->i_number == source_ino) return (EEXIST); if (target->i_number == parent_ino) return (0); if (target->i_number == UFS_ROOTINO) return (0); for (;;) { error = ufs_dir_dd_ino(vp, cred, &dd_ino, &vp1); if (error != 0) break; if (dd_ino == source_ino) { error = EINVAL; break; } if (dd_ino == UFS_ROOTINO) break; if (dd_ino == parent_ino) break; if (vp1 == NULL) { error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1); if (error != 0) { *wait_ino = dd_ino; break; } } KASSERT(dd_ino == VTOI(vp1)->i_number, ("directory %ju reparented\n", (uintmax_t)VTOI(vp1)->i_number)); if (vp != tvp) vput(vp); vp = vp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (vp1 != NULL) vput(vp1); if (vp != tvp) vput(vp); return (error); } #ifdef DIAGNOSTIC static void ufs_assert_inode_offset_owner(struct inode *ip, struct iown_tracker *tr, const char *name, const char *file, int line) { char msg[128]; snprintf(msg, sizeof(msg), "at %s@%d", file, line); ASSERT_VOP_ELOCKED(ITOV(ip), msg); MPASS((ip->i_mode & IFMT) == IFDIR); if (curthread == tr->tr_owner && ip->i_lock_gen == tr->tr_gen) return; printf("locked at\n"); stack_print(&tr->tr_st); printf("unlocked at\n"); stack_print(&tr->tr_unlock); panic("%s ip %p %jd offset owner %p %d gen %d " "curthread %p %d gen %d at %s@%d\n", name, ip, (uintmax_t)ip->i_number, tr->tr_owner, tr->tr_owner->td_tid, tr->tr_gen, curthread, curthread->td_tid, ip->i_lock_gen, file, line); } static void ufs_set_inode_offset_owner(struct inode *ip, struct iown_tracker *tr, const char *file, int line) { char msg[128]; snprintf(msg, sizeof(msg), "at %s@%d", file, line); ASSERT_VOP_ELOCKED(ITOV(ip), msg); MPASS((ip->i_mode & IFMT) == IFDIR); tr->tr_owner = curthread; tr->tr_gen = ip->i_lock_gen; stack_save(&tr->tr_st); } static void ufs_init_one_tracker(struct iown_tracker *tr) { tr->tr_owner = NULL; stack_zero(&tr->tr_st); } void ufs_init_trackers(struct inode *ip) { ufs_init_one_tracker(&ip->i_offset_tracker); ufs_init_one_tracker(&ip->i_count_tracker); ufs_init_one_tracker(&ip->i_endoff_tracker); } void ufs_unlock_tracker(struct inode *ip) { if (ip->i_count_tracker.tr_gen == ip->i_lock_gen) stack_save(&ip->i_count_tracker.tr_unlock); if (ip->i_offset_tracker.tr_gen == ip->i_lock_gen) stack_save(&ip->i_offset_tracker.tr_unlock); if (ip->i_endoff_tracker.tr_gen == ip->i_lock_gen) stack_save(&ip->i_endoff_tracker.tr_unlock); ip->i_lock_gen++; } doff_t ufs_get_i_offset(struct inode *ip, const char *file, int line) { ufs_assert_inode_offset_owner(ip, &ip->i_offset_tracker, "i_offset", file, line); return (ip->i_offset); } void ufs_set_i_offset(struct inode *ip, doff_t off, const char *file, int line) { ufs_set_inode_offset_owner(ip, &ip->i_offset_tracker, file, line); ip->i_offset = off; } int32_t ufs_get_i_count(struct inode *ip, const char *file, int line) { ufs_assert_inode_offset_owner(ip, &ip->i_count_tracker, "i_count", file, line); return (ip->i_count); } void ufs_set_i_count(struct inode *ip, int32_t cnt, const char *file, int line) { ufs_set_inode_offset_owner(ip, &ip->i_count_tracker, file, line); ip->i_count = cnt; } doff_t ufs_get_i_endoff(struct inode *ip, const char *file, int line) { ufs_assert_inode_offset_owner(ip, &ip->i_endoff_tracker, "i_endoff", file, line); return (ip->i_endoff); } void ufs_set_i_endoff(struct inode *ip, doff_t off, const char *file, int line) { ufs_set_inode_offset_owner(ip, &ip->i_endoff_tracker, file, line); ip->i_endoff = off; } #endif diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index af01fb37fe53..cf101ae50750 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1,3079 +1,3059 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_suiddir.h" #include "opt_ufs.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #ifdef UFS_GJOURNAL #include FEATURE(ufs_gjournal, "Journaling support through GEOM for UFS"); #endif #ifdef QUOTA FEATURE(ufs_quota, "UFS disk quotas support"); FEATURE(ufs_quota64, "64bit UFS disk quotas support"); #endif #ifdef SUIDDIR FEATURE(suiddir, "Give all new files in directory the same ownership as the directory"); #endif VFS_SMR_DECLARE; #include static vop_accessx_t ufs_accessx; static vop_fplookup_vexec_t ufs_fplookup_vexec; static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ufs_close; static vop_create_t ufs_create; static vop_stat_t ufs_stat; static vop_getattr_t ufs_getattr; static vop_ioctl_t ufs_ioctl; static vop_link_t ufs_link; static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *, const char *); static vop_mmapped_t ufs_mmapped; static vop_mkdir_t ufs_mkdir; static vop_mknod_t ufs_mknod; static vop_open_t ufs_open; static vop_pathconf_t ufs_pathconf; static vop_print_t ufs_print; static vop_readlink_t ufs_readlink; static vop_remove_t ufs_remove; static vop_rename_t ufs_rename; static vop_rmdir_t ufs_rmdir; static vop_setattr_t ufs_setattr; static vop_strategy_t ufs_strategy; static vop_symlink_t ufs_symlink; static vop_whiteout_t ufs_whiteout; static vop_close_t ufsfifo_close; SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "UFS filesystem"); /* * A virgin directory (no blushing please). */ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." }; static struct odirtemplate omastertemplate = { 0, 12, 1, ".", 0, DIRBLKSIZ - 12, 2, ".." }; static void ufs_itimes_locked(struct vnode *vp) { struct inode *ip; struct timespec ts; ASSERT_VI_LOCKED(vp, __func__); ip = VTOI(vp); if (UFS_RDONLY(ip)) goto out; if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) UFS_INODE_SET_FLAG(ip, IN_LAZYMOD); else if (((vp->v_mount->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) || (ip->i_flag & (IN_CHANGE | IN_UPDATE))) UFS_INODE_SET_FLAG(ip, IN_MODIFIED); else if (ip->i_flag & IN_ACCESS) UFS_INODE_SET_FLAG(ip, IN_LAZYACCESS); vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { DIP_SET(ip, i_atime, ts.tv_sec); DIP_SET(ip, i_atimensec, ts.tv_nsec); } if (ip->i_flag & IN_UPDATE) { DIP_SET(ip, i_mtime, ts.tv_sec); DIP_SET(ip, i_mtimensec, ts.tv_nsec); } if (ip->i_flag & IN_CHANGE) { DIP_SET(ip, i_ctime, ts.tv_sec); DIP_SET(ip, i_ctimensec, ts.tv_nsec); DIP_SET(ip, i_modrev, DIP(ip, i_modrev) + 1); } out: ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } void ufs_itimes(struct vnode *vp) { struct inode *ip; ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; VI_LOCK(vp); ufs_itimes_locked(vp); VI_UNLOCK(vp); } static int ufs_sync_nlink1(struct mount *mp) { int error; error = vfs_busy(mp, 0); if (error == 0) { VFS_SYNC(mp, MNT_WAIT); vfs_unbusy(mp); error = ERELOOKUP; } vfs_rel(mp); return (error); } static int ufs_sync_nlink(struct vnode *vp, struct vnode *vp1) { struct inode *ip; struct mount *mp; int error; ip = VTOI(vp); if (ip->i_nlink < UFS_LINK_MAX) return (0); if (!DOINGSOFTDEP(vp) || ip->i_effnlink >= UFS_LINK_MAX) return (EMLINK); mp = vp->v_mount; vfs_ref(mp); VOP_UNLOCK(vp); if (vp1 != NULL) VOP_UNLOCK(vp1); error = ufs_sync_nlink1(mp); vn_lock_pair(vp, false, vp1, false); return (error); } /* * Create a regular file */ static int ufs_create( struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap) { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp, "ufs_create"); if (error != 0) return (error); if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ static int ufs_mknod( struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap) { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp, "ufs_mknod"); if (error) return (error); ip = VTOI(*vpp); UFS_INODE_SET_FLAG(ip, IN_ACCESS | IN_CHANGE | IN_UPDATE); if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ DIP_SET(ip, i_rdev, vap->va_rdev); } /* * Remove inode, then reload it through VFS_VGET(). This is * needed to do further inode initialization, for instance * fifo, which was too early for VFS_VGET() done as part of * UFS_VALLOC(). */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } /* * Open called. */ /* ARGSUSED */ static int ufs_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); ip = VTOI(vp); vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td); if (vp->v_type == VREG && (vn_irflag_read(vp) & VIRF_PGREAD) == 0 && ip->i_ump->um_bsize >= PAGE_SIZE) { vn_irflag_set_cond(vp, VIRF_PGREAD); } /* * Files marked append-only must be opened for appending. */ if ((ip->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int ufs_close( struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap) { struct vnode *vp = ap->a_vp; ufs_itimes(vp); return (0); } static int ufs_accessx( struct vop_accessx_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; #ifdef UFS_ACL struct acl *acl; acl_type_t type; #endif /* * Disallow write attempts on read-only filesystems; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (accmode & VMODIFY_PERMS) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA /* * Inode is accounted in the quotas only if struct * dquot is attached to it. VOP_ACCESS() is called * from vn_open_cred() and provides a convenient * point to call getinoquota(). The lock mode is * exclusive when the file is opening for write. */ if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { error = getinoquota(ip); if (error != 0) return (error); } #endif break; default: break; } } /* * If immutable bit set, nobody gets to write it. "& ~VADMIN_PERMS" * permits the owner of the file to remove the IMMUTABLE flag. */ if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) return (EPERM); #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) != 0) { if (vp->v_mount->mnt_flag & MNT_NFS4ACLS) type = ACL_TYPE_NFS4; else type = ACL_TYPE_ACCESS; acl = acl_alloc(M_WAITOK); if (type == ACL_TYPE_NFS4) error = ufs_getacl_nfs4_internal(vp, acl, ap->a_td); else error = VOP_GETACL(vp, type, acl, ap->a_cred, ap->a_td); switch (error) { case 0: if (type == ACL_TYPE_NFS4) { error = vaccess_acl_nfs4(vp->v_type, ip->i_uid, ip->i_gid, acl, accmode, ap->a_cred); } else { error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess_acl_posix1e(vp->v_type, ip->i_uid, ip->i_gid, acl, accmode, ap->a_cred); } break; default: if (error != EOPNOTSUPP) printf( "ufs_accessx(): Error retrieving ACL on object (%d).\n", error); /* * XXX: Fall back until debugged. Should * eventually possibly log an error, and return * EPERM for safety. */ error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, accmode, ap->a_cred); } acl_free(acl); return (error); } #endif /* !UFS_ACL */ error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, accmode, ap->a_cred); return (error); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ static int ufs_fplookup_vexec( struct vop_fplookup_vexec_args /* { struct vnode *a_vp; struct ucred *a_cred; struct thread *a_td; } */ *ap) { struct vnode *vp; struct inode *ip; struct ucred *cred; mode_t all_x, mode; vp = ap->a_vp; ip = VTOI_SMR(vp); if (__predict_false(ip == NULL)) return (EAGAIN); /* * XXX ACL race * * ACLs are not supported and UFS clears/sets this flag on mount and * remount. However, we may still be racing with seeing them and there * is no provision to make sure they were accounted for. This matches * the behavior of the locked case, since the lookup there is also * racy: mount takes no measures to block anyone from progressing. */ all_x = S_IXUSR | S_IXGRP | S_IXOTH; mode = atomic_load_short(&ip->i_mode); if (__predict_true((mode & all_x) == all_x)) return (0); cred = ap->a_cred; return (vaccess_vexec_smr(mode, ip->i_uid, ip->i_gid, cred)); } /* ARGSUSED */ static int ufs_stat(struct vop_stat_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct stat *sb = ap->a_sb; int error; error = vop_stat_helper_pre(ap); if (__predict_false(error)) return (error); VI_LOCK(vp); ufs_itimes_locked(vp); if (I_IS_UFS1(ip)) { sb->st_atim.tv_sec = ip->i_din1->di_atime; sb->st_atim.tv_nsec = ip->i_din1->di_atimensec; } else { sb->st_atim.tv_sec = ip->i_din2->di_atime; sb->st_atim.tv_nsec = ip->i_din2->di_atimensec; } VI_UNLOCK(vp); sb->st_dev = dev2udev(ITOUMP(ip)->um_dev); sb->st_ino = ip->i_number; sb->st_mode = (ip->i_mode & ~IFMT) | VTTOIF(vp->v_type); sb->st_nlink = ip->i_effnlink; sb->st_uid = ip->i_uid; sb->st_gid = ip->i_gid; if (I_IS_UFS1(ip)) { sb->st_rdev = ip->i_din1->di_rdev; sb->st_size = ip->i_din1->di_size; sb->st_mtim.tv_sec = ip->i_din1->di_mtime; sb->st_mtim.tv_nsec = ip->i_din1->di_mtimensec; sb->st_ctim.tv_sec = ip->i_din1->di_ctime; sb->st_ctim.tv_nsec = ip->i_din1->di_ctimensec; sb->st_birthtim.tv_sec = -1; sb->st_birthtim.tv_nsec = 0; sb->st_blocks = dbtob((u_quad_t)ip->i_din1->di_blocks) / S_BLKSIZE; } else { sb->st_rdev = ip->i_din2->di_rdev; sb->st_size = ip->i_din2->di_size; sb->st_mtim.tv_sec = ip->i_din2->di_mtime; sb->st_mtim.tv_nsec = ip->i_din2->di_mtimensec; sb->st_ctim.tv_sec = ip->i_din2->di_ctime; sb->st_ctim.tv_nsec = ip->i_din2->di_ctimensec; sb->st_birthtim.tv_sec = ip->i_din2->di_birthtime; sb->st_birthtim.tv_nsec = ip->i_din2->di_birthnsec; sb->st_blocks = dbtob((u_quad_t)ip->i_din2->di_blocks) / S_BLKSIZE; } sb->st_blksize = max(PAGE_SIZE, vp->v_mount->mnt_stat.f_iosize); sb->st_flags = ip->i_flags; sb->st_gen = ip->i_gen; return (vop_stat_helper_post(ap, error)); } /* ARGSUSED */ static int ufs_getattr( struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; VI_LOCK(vp); ufs_itimes_locked(vp); if (I_IS_UFS1(ip)) { vap->va_atime.tv_sec = ip->i_din1->di_atime; vap->va_atime.tv_nsec = ip->i_din1->di_atimensec; } else { vap->va_atime.tv_sec = ip->i_din2->di_atime; vap->va_atime.tv_nsec = ip->i_din2->di_atimensec; } VI_UNLOCK(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_effnlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; if (I_IS_UFS1(ip)) { vap->va_rdev = ip->i_din1->di_rdev; vap->va_size = ip->i_din1->di_size; vap->va_mtime.tv_sec = ip->i_din1->di_mtime; vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din1->di_ctime; vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec; vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks); vap->va_filerev = ip->i_din1->di_modrev; } else { vap->va_rdev = ip->i_din2->di_rdev; vap->va_size = ip->i_din2->di_size; vap->va_mtime.tv_sec = ip->i_din2->di_mtime; vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din2->di_ctime; vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec; vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime; vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec; vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks); vap->va_filerev = ip->i_din2->di_modrev; } vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_type = IFTOVT(ip->i_mode); return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ufs_setattr( struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap) { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if ((vap->va_flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | SF_SNAPSHOT | UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | UF_SPARSE | UF_SYSTEM)) != 0) return (EOPNOTSUPP); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } /* The snapshot flag cannot be toggled. */ if ((vap->va_flags ^ ip->i_flags) & SF_SNAPSHOT) return (EPERM); } else { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE)) return (EPERM); } ip->i_flags = vap->va_flags; DIP_SET(ip, i_flags, vap->va_flags); UFS_INODE_SET_FLAG(ip, IN_CHANGE); error = UFS_UPDATE(vp, 0); if (ip->i_flags & (IMMUTABLE | APPEND)) return (error); } /* * If immutable or append, no one can change any of its attributes * except the ones already handled (in some cases, file flags * including the immutability flags themselves for the superuser). */ if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * XXX most of the following special cases should be in * callers instead of in N filesystems. The VDIR check * mostly already is. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: /* * Truncation should have an effect in these cases. * Disallow it if the filesystem is read-only or * the file is being snapshotted. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (IS_SNAPSHOT(ip)) return (EPERM); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support shared memory objects in the file * system, and have dubious support for truncating * symlinks. Just ignore the request in other cases. */ return (0); } if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL | ((vap->va_vaflags & VA_SYNC) != 0 ? IO_SYNC : 0), cred)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (IS_SNAPSHOT(ip)) return (EPERM); error = vn_utimes_perm(vp, vap, cred, td); if (error != 0) return (error); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); if (vap->va_atime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_ACCESS; DIP_SET(ip, i_atime, vap->va_atime.tv_sec); DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec); } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_UPDATE; DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec); DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec); } if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) { ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec; ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec; } error = UFS_UPDATE(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (IS_SNAPSHOT(ip) && (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH)) != 0) return (EPERM); error = ufs_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } #ifdef UFS_ACL static int ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode, int file_owner_id, struct ucred *cred, struct thread *td) { int error; struct acl *aclp; aclp = acl_alloc(M_WAITOK); error = ufs_getacl_nfs4_internal(vp, aclp, td); /* * We don't have to handle EOPNOTSUPP here, as the filesystem claims * it supports ACLs. */ if (error) goto out; acl_nfs4_sync_acl_from_mode(aclp, mode, file_owner_id); error = ufs_setacl_nfs4_internal(vp, aclp, td); out: acl_free(aclp); return (error); } #endif /* UFS_ACL */ static int ufs_mmapped( struct vop_mmapped_args /* { struct vnode *a_vp; } */ *ap) { struct vnode *vp; struct inode *ip; struct mount *mp; vp = ap->a_vp; ip = VTOI(vp); mp = vp->v_mount; if ((mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS); /* * XXXKIB No UFS_UPDATE(ap->a_vp, 0) there. */ return (0); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); int newmode, error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. Both of these are allowed in * jail(8). */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } /* * Deny setting setuid if we are not the file owner. */ if ((mode & ISUID) && ip->i_uid != cred->cr_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN); if (error) return (error); } newmode = ip->i_mode & ~ALLPERMS; newmode |= (mode & ALLPERMS); UFS_INODE_SET_MODE(ip, newmode); DIP_SET(ip, i_mode, ip->i_mode); UFS_INODE_SET_FLAG(ip, IN_CHANGE); #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0) error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, td); #endif if (error == 0 && (ip->i_flag & IN_CHANGE) != 0) error = UFS_UPDATE(vp, 0); return (error); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA int i; ufs2_daddr_t change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if (((uid != ip->i_uid && uid != cred->cr_uid) || (gid != ip->i_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = DIP(ip, i_blocks); (void) chkdq(ip, -change, cred, CHOWN|FORCE); (void) chkiq(ip, -1, cred, CHOWN|FORCE); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; DIP_SET(ip, i_gid, gid); ip->i_uid = uid; DIP_SET(ip, i_uid, uid); #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; DIP_SET(ip, i_gid, ogid); ip->i_uid = ouid; DIP_SET(ip, i_uid, ouid); if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("ufs_chown: lost quota"); #endif /* QUOTA */ UFS_INODE_SET_FLAG(ip, IN_CHANGE); if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID)); DIP_SET(ip, i_mode, ip->i_mode); } } error = UFS_UPDATE(vp, 0); return (error); } static int ufs_remove( struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap) { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; struct thread *td; td = curthread; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) return (EPERM); if (DOINGSUJ(dvp)) { error = softdep_prelink(dvp, vp, ap->a_cnp); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); if (ip->i_nlink <= 0) vp->v_vflag |= VV_NOSYNC; if (IS_SNAPSHOT(ip)) { /* * Avoid deadlock where another thread is trying to * update the inodeblock for dvp and is waiting on * snaplk. Temporary unlock the vnode lock for the * unlinked file and sync the directory. This should * allow vput() of the directory to not block later on * while holding the snapshot vnode locked, assuming * that the directory hasn't been unlinked too. */ VOP_UNLOCK(vp); (void) VOP_FSYNC(dvp, MNT_WAIT, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } return (error); } static void print_bad_link_count(const char *funcname, struct vnode *dvp) { struct inode *dip; dip = VTOI(dvp); uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n", funcname, dip->i_effnlink, (intmax_t)dip->i_number, dvp->v_mount->mnt_stat.f_mntonname); } /* * link vnode call */ static int ufs_link( struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; struct direct newdir; int error; -#ifdef INVARIANTS - if ((cnp->cn_flags & HASBUF) == 0) - panic("ufs_link: no name"); -#endif - if (DOINGSUJ(tdvp)) { error = softdep_prelink(tdvp, vp, cnp); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } if (VTOI(tdvp)->i_effnlink < 2) { print_bad_link_count("ufs_link", tdvp); error = EINVAL; goto out; } error = ufs_sync_nlink(vp, tdvp); if (error != 0) goto out; ip = VTOI(vp); /* * The file may have been removed after namei dropped the original * lock. */ if (ip->i_effnlink == 0) { error = ENOENT; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_effnlink++; ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(vp)) softdep_setup_link(VTOI(tdvp), ip); error = UFS_UPDATE(vp, !DOINGSOFTDEP(vp) && !DOINGASYNC(vp)); if (!error) { ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); } if (error) { ip->i_effnlink--; ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(vp)) softdep_revert_link(VTOI(tdvp), ip); } out: return (error); } /* * whiteout vnode call */ static int ufs_whiteout( struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap) { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct newdir; int error = 0; if (DOINGSUJ(dvp) && (ap->a_flags == CREATE || ap->a_flags == DELETE)) { error = softdep_prelink(dvp, NULL, cnp); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (!OFSFMT(dvp)) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ #ifdef INVARIANTS - if ((cnp->cn_flags & SAVENAME) == 0) - panic("ufs_whiteout: missing name"); if (OFSFMT(dvp)) panic("ufs_whiteout: old format filesystem"); #endif newdir.d_ino = UFS_WINO; newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); break; case DELETE: /* remove an existing directory whiteout */ #ifdef INVARIANTS if (OFSFMT(dvp)) panic("ufs_whiteout: old format filesystem"); #endif cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); } return (error); } static volatile int rename_restarts; SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD, __DEVOLATILE(int *, &rename_restarts), 0, "Times rename had to restart due to lock contention"); /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int ufs_rename( struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap) { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *nvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = curthread; struct inode *fip, *tip, *tdp, *fdp; struct direct newdir; off_t endoff; int doingdirectory, newparent; int error = 0; struct mount *mp; ino_t ino; seqc_t fdvp_s, fvp_s, tdvp_s, tvp_s; bool checkpath_locked, want_seqc_end; checkpath_locked = want_seqc_end = false; -#ifdef INVARIANTS - if ((tcnp->cn_flags & HASBUF) == 0 || - (fcnp->cn_flags & HASBUF) == 0) - panic("ufs_rename: no name"); -#endif endoff = 0; mp = tdvp->v_mount; VOP_UNLOCK(tdvp); if (tvp && tvp != tdvp) VOP_UNLOCK(tvp); /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; mp = NULL; goto releout; } fdvp_s = fvp_s = tdvp_s = tvp_s = SEQC_MOD; relock: /* * We need to acquire 2 to 4 locks depending on whether tvp is NULL * and fdvp and tdvp are the same directory. Subsequently we need * to double-check all paths and in the directory rename case we * need to verify that we are not creating a directory loop. To * handle this we acquire all but fdvp using non-blocking * acquisitions. If we fail to acquire any lock in the path we will * drop all held locks, acquire the new lock in a blocking fashion, * and then release it and restart the rename. This acquire/release * step ensures that we do not spin on a lock waiting for release. */ error = vn_lock(fdvp, LK_EXCLUSIVE); if (error) goto releout; if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(fdvp); error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto releout; VOP_UNLOCK(tdvp); atomic_add_int(&rename_restarts, 1); goto relock; } /* * Re-resolve fvp to be certain it still exists and fetch the * correct vnode. */ error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); if (error) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); goto releout; } error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (error) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); if (error != EBUSY) goto releout; error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp); vrele(fvp); fvp = nvp; atomic_add_int(&rename_restarts, 1); goto relock; } vrele(fvp); fvp = nvp; /* * Re-resolve tvp and acquire the vnode lock if present. */ error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino); if (error != 0 && error != EJUSTRETURN) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(fvp); goto releout; } /* * If tvp disappeared we just carry on. */ if (error == EJUSTRETURN && tvp != NULL) { vrele(tvp); tvp = NULL; } /* * Get the tvp ino if the lookup succeeded. We may have to restart * if the non-blocking acquire fails. */ if (error == 0) { nvp = NULL; error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (tvp) vrele(tvp); tvp = nvp; if (error) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(fvp); if (error != EBUSY) goto releout; error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; vput(nvp); atomic_add_int(&rename_restarts, 1); goto relock; } } if (DOINGSUJ(fdvp) && (seqc_in_modify(fdvp_s) || !vn_seqc_consistent(fdvp, fdvp_s) || seqc_in_modify(fvp_s) || !vn_seqc_consistent(fvp, fvp_s) || seqc_in_modify(tdvp_s) || !vn_seqc_consistent(tdvp, tdvp_s) || (tvp != NULL && (seqc_in_modify(tvp_s) || !vn_seqc_consistent(tvp, tvp_s))))) { error = softdep_prerename(fdvp, fvp, tdvp, tvp); if (error != 0) goto releout; } fdp = VTOI(fdvp); fip = VTOI(fvp); tdp = VTOI(tdvp); tip = NULL; if (tvp) tip = VTOI(tvp); if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto unlockout; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. However, things could change after * we drop the locks above. */ if (fvp == tvp) { error = 0; goto unlockout; } doingdirectory = 0; newparent = 0; ino = fip->i_number; if (fip->i_nlink >= UFS_LINK_MAX) { if (!DOINGSOFTDEP(fvp) || fip->i_effnlink >= UFS_LINK_MAX) { error = EMLINK; goto unlockout; } vfs_ref(mp); MPASS(!want_seqc_end); if (checkpath_locked) { sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock); checkpath_locked = false; } VOP_UNLOCK(fdvp); VOP_UNLOCK(fvp); vref(tdvp); if (tvp != NULL) vref(tvp); VOP_VPUT_PAIR(tdvp, &tvp, true); error = ufs_sync_nlink1(mp); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (fdp->i_flags & APPEND)) { error = EPERM; goto unlockout; } if ((fip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || fdp == fip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } if (fdp->i_number != tdp->i_number) newparent = tdp->i_number; doingdirectory = 1; } if ((fvp->v_type == VDIR && fvp->v_mountedhere != NULL) || (tvp != NULL && tvp->v_type == VDIR && tvp->v_mountedhere != NULL)) { error = EXDEV; goto unlockout; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". */ if (doingdirectory && newparent) { error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, curthread); if (error) goto unlockout; sx_xlock(&VFSTOUFS(mp)->um_checkpath_lock); checkpath_locked = true; error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, &ino); /* * We encountered a lock that we have to wait for. Unlock * everything else and VGET before restarting. */ if (ino) { sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock); checkpath_locked = false; VOP_UNLOCK(fdvp); VOP_UNLOCK(fvp); VOP_UNLOCK(tdvp); if (tvp) VOP_UNLOCK(tvp); error = VFS_VGET(mp, ino, LK_SHARED, &nvp); if (error == 0) vput(nvp); atomic_add_int(&rename_restarts, 1); goto relock; } if (error) goto unlockout; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); } if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 || tdp->i_effnlink == 0) panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp); if (tvp != NULL) vn_seqc_write_begin(tvp); vn_seqc_write_begin(tdvp); vn_seqc_write_begin(fvp); vn_seqc_write_begin(fdvp); want_seqc_end = true; /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ fip->i_effnlink++; fip->i_nlink++; DIP_SET(fip, i_nlink, fip->i_nlink); UFS_INODE_SET_FLAG(fip, IN_CHANGE); if (DOINGSOFTDEP(fvp)) softdep_setup_link(tdp, fip); error = UFS_UPDATE(fvp, !DOINGSOFTDEP(fvp) && !DOINGASYNC(fvp)); if (error) goto bad; /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (tip == NULL) { if (ITODEV(tdp) != ITODEV(fip)) panic("ufs_rename: EXDEV"); if (doingdirectory && newparent) { /* * Account for ".." in new directory. * When source and destination have the same * parent we don't adjust the link count. The * actual link modification is completed when * .. is rewritten below. */ if (tdp->i_nlink >= UFS_LINK_MAX) { fip->i_effnlink--; fip->i_nlink--; DIP_SET(fip, i_nlink, fip->i_nlink); UFS_INODE_SET_FLAG(fip, IN_CHANGE); if (DOINGSOFTDEP(fvp)) softdep_revert_link(tdp, fip); if (!DOINGSOFTDEP(tdvp) || tdp->i_effnlink >= UFS_LINK_MAX) { error = EMLINK; goto unlockout; } MPASS(want_seqc_end); if (tvp != NULL) vn_seqc_write_end(tvp); vn_seqc_write_end(tdvp); vn_seqc_write_end(fvp); vn_seqc_write_end(fdvp); want_seqc_end = false; vfs_ref(mp); MPASS(checkpath_locked); sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock); checkpath_locked = false; VOP_UNLOCK(fdvp); VOP_UNLOCK(fvp); vref(tdvp); if (tvp != NULL) vref(tvp); VOP_VPUT_PAIR(tdvp, &tvp, true); error = ufs_sync_nlink1(mp); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } } ufs_makedirentry(fip, tcnp, &newdir); error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); if (error) goto bad; /* Setup tdvp for directory compaction if needed. */ if (I_COUNT(tdp) != 0 && I_ENDOFF(tdp) != 0 && I_ENDOFF(tdp) < tdp->i_size) endoff = I_ENDOFF(tdp); } else { if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip)) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (tip->i_number == fip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller * must possess VADMIN for the parent directory, or the * destination of the rename. This implements append-only * directories. */ if ((tdp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((tip->i_mode & IFMT) == IFDIR) { if ((tip->i_effnlink > 2) || !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } if (doingdirectory) { if (!newparent) { tdp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); } tip->i_effnlink--; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(tip); } error = ufs_dirrewrite(tdp, tip, fip->i_number, IFTODT(fip->i_mode), (doingdirectory && newparent) ? newparent : doingdirectory); if (error) { if (doingdirectory) { if (!newparent) { tdp->i_effnlink++; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); } tip->i_effnlink++; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(tip); } goto bad; } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * The only stuff left in the directory is "." * and "..". The "." reference is inconsequential * since we are quashing it. We have removed the "." * reference and the reference in the parent directory, * but there may be other hard links. The soft * dependency code will arrange to do these operations * after the parent directory entry has been deleted on * disk, so when running with that code we avoid doing * them now. */ if (!newparent) { tdp->i_nlink--; DIP_SET(tdp, i_nlink, tdp->i_nlink); UFS_INODE_SET_FLAG(tdp, IN_CHANGE); } tip->i_nlink--; DIP_SET(tip, i_nlink, tip->i_nlink); UFS_INODE_SET_FLAG(tip, IN_CHANGE); } } /* * 3) Unlink the source. We have to resolve the path again to * fixup the directory offset and count for ufs_dirremove. */ if (fdvp == tdvp) { error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); if (error) panic("ufs_rename: from entry went away!"); if (ino != fip->i_number) panic("ufs_rename: ino mismatch %ju != %ju\n", (uintmax_t)ino, (uintmax_t)fip->i_number); } /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { /* * If tip exists we simply use its link, otherwise we must * add a new one. */ if (tip == NULL) { tdp->i_effnlink++; tdp->i_nlink++; DIP_SET(tdp, i_nlink, tdp->i_nlink); UFS_INODE_SET_FLAG(tdp, IN_CHANGE); if (DOINGSOFTDEP(tdvp)) softdep_setup_dotdot_link(tdp, fip); error = UFS_UPDATE(tdvp, !DOINGSOFTDEP(tdvp) && !DOINGASYNC(tdvp)); /* Don't go to bad here as the new link exists. */ if (error) goto unlockout; } else if (DOINGSUJ(tdvp)) /* Journal must account for each new link. */ softdep_setup_dotdot_link(tdp, fip); SET_I_OFFSET(fip, mastertemplate.dot_reclen); ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); cache_purge(fdvp); } error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0); /* * The kern_renameat() looks up the fvp using the DELETE flag, which * causes the removal of the name cache entry for fvp. * As the relookup of the fvp is done in two steps: * ufs_lookup_ino() and then VFS_VGET(), another thread might do a * normal lookup of the from name just before the VFS_VGET() call, * causing the cache entry to be re-instantiated. * * The same issue also applies to tvp if it exists as * otherwise we may have a stale name cache entry for the new * name that references the old i-node if it has other links * or open file descriptors. */ cache_vop_rename(fdvp, fvp, tdvp, tvp, fcnp, tcnp); unlockout: if (want_seqc_end) { if (tvp != NULL) vn_seqc_write_end(tvp); vn_seqc_write_end(tdvp); vn_seqc_write_end(fvp); vn_seqc_write_end(fdvp); } if (checkpath_locked) sx_xunlock(&VFSTOUFS(mp)->um_checkpath_lock); vput(fdvp); vput(fvp); /* * If compaction or fsync was requested do it in * ffs_vput_pair() now that other locks are no longer needed. */ if (error == 0 && endoff != 0) { UFS_INODE_SET_FLAG(tdp, IN_ENDOFF); SET_I_ENDOFF(tdp, endoff); } VOP_VPUT_PAIR(tdvp, &tvp, true); return (error); bad: fip->i_effnlink--; fip->i_nlink--; DIP_SET(fip, i_nlink, fip->i_nlink); UFS_INODE_SET_FLAG(fip, IN_CHANGE); if (DOINGSOFTDEP(fvp)) softdep_revert_link(tdp, fip); goto unlockout; releout: if (want_seqc_end) { if (tvp != NULL) vn_seqc_write_end(tvp); vn_seqc_write_end(tdvp); vn_seqc_write_end(fvp); vn_seqc_write_end(fdvp); } vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp) vrele(tvp); return (error); } #ifdef UFS_ACL static int ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(M_WAITOK); dacl = acl_alloc(M_WAITOK); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); UFS_INODE_SET_MODE(ip, dmode); DIP_SET(ip, i_mode, dmode); *dacl = *acl; ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ UFS_INODE_SET_MODE(ip, dmode); DIP_SET(ip, i_mode, dmode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); if (error == 0) error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); /* panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(M_WAITOK); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); UFS_INODE_SET_MODE(ip, mode); DIP_SET(ip, i_mode, mode); ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ UFS_INODE_SET_MODE(ip, mode); DIP_SET(ip, i_mode, mode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. */ printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()\n"); /* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); return (error); } static int ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp, mode_t child_mode, struct ucred *cred, struct thread *td) { int error; struct acl *parent_aclp, *child_aclp; parent_aclp = acl_alloc(M_WAITOK); child_aclp = acl_alloc(M_WAITOK | M_ZERO); error = ufs_getacl_nfs4_internal(dvp, parent_aclp, td); if (error) goto out; acl_nfs4_compute_inherited_acl(parent_aclp, child_aclp, child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR); error = ufs_setacl_nfs4_internal(tvp, child_aclp, td); if (error) goto out; out: acl_free(parent_aclp); acl_free(child_aclp); return (error); } #endif /* * Mkdir system call */ static int ufs_mkdir( struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct buf *bp; struct dirtemplate dirtemplate, *dtp; struct direct newdir; int error, dmode; long blkoff; -#ifdef INVARIANTS - if ((cnp->cn_flags & HASBUF) == 0) - panic("ufs_mkdir: no name"); -#endif dp = VTOI(dvp); error = ufs_sync_nlink(dvp, NULL); if (error != 0) goto out; dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ if (dp->i_effnlink < 2) { print_bad_link_count("ufs_mkdir", dvp); error = EINVAL; goto out; } if (DOINGSUJ(dvp)) { error = softdep_prelink(dvp, NULL, cnp); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; vn_seqc_write_begin(tvp); ip = VTOI(tvp); ip->i_gid = dp->i_gid; DIP_SET(ip, i_gid, dp->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; gid_t ucred_group; ucp = cnp->cn_cred; #endif /* * If we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; DIP_SET(ip, i_uid, dp->i_uid); #ifdef QUOTA if (dp->i_uid != cnp->cn_cred->cr_uid) { /* * Make sure the correct user gets charged * for the space. * Make a dummy credential for the victim. * XXX This seems to never be accessed out of * our context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; ucred.cr_groups[0] = dp->i_gid; ucp = &ucred; } #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ UFS_INODE_SET_FLAG(ip, IN_ACCESS | IN_CHANGE | IN_UPDATE); UFS_INODE_SET_MODE(ip, dmode); DIP_SET(ip, i_mode, dmode); tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 2; ip->i_nlink = 2; DIP_SET(ip, i_nlink, 2); if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Bump link count in parent directory to reflect work done below. * Should be done before reference is created so cleanup is * possible if we crash. */ dp->i_effnlink++; dp->i_nlink++; DIP_SET(dp, i_nlink, dp->i_nlink); UFS_INODE_SET_FLAG(dp, IN_CHANGE); if (DOINGSOFTDEP(dvp)) softdep_setup_mkdir(dp, ip); error = UFS_UPDATE(dvp, !DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, curthread); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode, cnp->cn_cred, curthread); if (error) goto bad; } #endif /* !UFS_ACL */ /* * Initialize directory with "." and ".." from static template. */ if (!OFSFMT(dvp)) dtp = &mastertemplate; else dtp = (struct dirtemplate *)&omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; vnode_pager_setsize(tvp, DIRBLKSIZ); if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred, BA_CLRBUF, &bp)) != 0) goto bad; ip->i_size = DIRBLKSIZ; DIP_SET(ip, i_size, DIRBLKSIZ); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate); if (DOINGSOFTDEP(tvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff = DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } } if ((error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp))) != 0) { (void)bwrite(bp); goto bad; } /* * Directory set up, now install its entry in the parent directory. * * If we are not doing soft dependencies, then we must write out the * buffer containing the new directory body before entering the new * name in the parent. If we are doing soft dependencies, then the * buffer containing the new directory body will be passed to and * released in the soft dependency code after the code has attached * an appropriate ordering dependency to the buffer which ensures that * the buffer is written before the new name is written in the parent. */ if (DOINGASYNC(dvp)) bdwrite(bp); else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, bp); bad: if (error == 0) { *ap->a_vpp = tvp; vn_seqc_write_end(tvp); } else { dp->i_effnlink--; dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); UFS_INODE_SET_FLAG(dp, IN_CHANGE); /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(tvp)) softdep_revert_mkdir(dp, ip); vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); } out: return (error); } /* * Rmdir system call. */ static int ufs_rmdir( struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * Do not remove a directory that is in the process of being renamed. * Verify the directory is empty (and valid). Rmdir ".." will not be * valid since ".." will contain a reference to the current directory * and thus be non-empty. Do not allow the removal of mounted on * directories (this can happen when an NFS exported filesystem * tries to remove a locally mounted on directory). */ error = 0; if (dp->i_effnlink <= 2) { if (dp->i_effnlink == 2) print_bad_link_count("ufs_rmdir", dvp); error = EINVAL; goto out; } if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } if (vp->v_mountedhere != 0) { error = EINVAL; goto out; } if (DOINGSUJ(dvp)) { error = softdep_prelink(dvp, vp, cnp); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ dp->i_effnlink--; ip->i_effnlink--; if (DOINGSOFTDEP(vp)) softdep_setup_rmdir(dp, ip); error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; if (DOINGSOFTDEP(vp)) softdep_revert_rmdir(dp, ip); goto out; } /* * The only stuff left in the directory is "." and "..". The "." * reference is inconsequential since we are quashing it. The soft * dependency code will arrange to do these operations after * the parent directory entry has been deleted on disk, so * when running with that code we avoid doing them now. */ if (!DOINGSOFTDEP(vp)) { dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); UFS_INODE_SET_FLAG(dp, IN_CHANGE); error = UFS_UPDATE(dvp, 0); ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); } cache_vop_rmdir(dvp, vp); #ifdef UFS_DIRHASH /* Kill any active hash; i_effnlink == 0, so it will not come back. */ if (ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif out: return (error); } /* * symlink -- make a symbolic link */ static int ufs_symlink( struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; const char *a_target; } */ *ap) { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp, "ufs_symlink"); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < VFSTOUFS(vp->v_mount)->um_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, DIP(ip, i_shortlink), len); ip->i_size = len; DIP_SET(ip, i_size, len); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); error = UFS_UPDATE(vp, 0); } else error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target), len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, NULL, NULL); if (error) vput(vp); return (error); } /* * Vnode op for reading directories. */ int ufs_readdir( struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; uint64_t **a_cookies; } */ *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct buf *bp; struct inode *ip; struct direct *dp, *edp; uint64_t *cookies; struct dirent dstdp; off_t offset, startoffset; size_t readcnt, skipcnt; ssize_t startresid; u_int ncookies; int error; if (uio->uio_offset < 0) return (EINVAL); ip = VTOI(vp); if (ip->i_effnlink == 0) return (0); if (ap->a_ncookies != NULL) { if (uio->uio_resid < 0) ncookies = 0; else ncookies = uio->uio_resid; if (uio->uio_offset >= ip->i_size) ncookies = 0; else if (ip->i_size - uio->uio_offset < ncookies) ncookies = ip->i_size - uio->uio_offset; ncookies = ncookies / (offsetof(struct direct, d_name) + 4) + 1; cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } else { ncookies = 0; cookies = NULL; } offset = startoffset = uio->uio_offset; startresid = uio->uio_resid; error = 0; while (error == 0 && uio->uio_resid > 0 && uio->uio_offset < ip->i_size) { error = UFS_BLKATOFF(vp, uio->uio_offset, NULL, &bp); if (error) break; if (bp->b_offset + bp->b_bcount > ip->i_size) readcnt = ip->i_size - bp->b_offset; else readcnt = bp->b_bcount; skipcnt = (size_t)(uio->uio_offset - bp->b_offset) & ~(size_t)(DIRBLKSIZ - 1); offset = bp->b_offset + skipcnt; dp = (struct direct *)&bp->b_data[skipcnt]; edp = (struct direct *)&bp->b_data[readcnt]; while (error == 0 && uio->uio_resid > 0 && dp < edp) { if (dp->d_reclen <= offsetof(struct direct, d_name) || (caddr_t)dp + dp->d_reclen > (caddr_t)edp) { error = EIO; break; } #if BYTE_ORDER == LITTLE_ENDIAN /* Old filesystem format. */ if (OFSFMT(vp)) { dstdp.d_namlen = dp->d_type; dstdp.d_type = dp->d_namlen; } else #endif { dstdp.d_namlen = dp->d_namlen; dstdp.d_type = dp->d_type; } if (offsetof(struct direct, d_name) + dstdp.d_namlen > dp->d_reclen) { error = EIO; break; } if (offset < startoffset || dp->d_ino == 0) goto nextentry; dstdp.d_fileno = dp->d_ino; dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); /* NOTE: d_off is the offset of the *next* entry. */ dstdp.d_off = offset + dp->d_reclen; dirent_terminate(&dstdp); if (dstdp.d_reclen > uio->uio_resid) { if (uio->uio_resid == startresid) error = EINVAL; else error = EJUSTRETURN; break; } /* Advance dp. */ error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio); if (error) break; if (cookies != NULL) { KASSERT(ncookies > 0, ("ufs_readdir: cookies buffer too small")); *cookies = offset + dp->d_reclen; cookies++; ncookies--; } nextentry: offset += dp->d_reclen; dp = (struct direct *)((caddr_t)dp + dp->d_reclen); } bqrelse(bp); uio->uio_offset = offset; } /* We need to correct uio_offset. */ uio->uio_offset = offset; if (error == EJUSTRETURN) error = 0; if (ap->a_ncookies != NULL) { if (error == 0) { ap->a_ncookies -= ncookies; } else { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } } if (error == 0 && ap->a_eofflag) *ap->a_eofflag = ip->i_size <= uio->uio_offset; return (error); } /* * Return target name of a symbolic link */ static int ufs_readlink( struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); doff_t isize; isize = ip->i_size; if (isize < VFSTOUFS(vp->v_mount)->um_maxsymlinklen) return (uiomove(DIP(ip, i_shortlink), isize, ap->a_uio)); return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ufs_bmaparray() operation may not * deadlock on memory. See ufs_bmap() for details. */ static int ufs_strategy( struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap) { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; ufs2_daddr_t blkno; int error; if (bp->b_blkno == bp->b_lblkno) { error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ufs_print( struct vop_print_args /* { struct vnode *a_vp; } */ *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); printf("\tnlink=%d, effnlink=%d, size=%jd", ip->i_nlink, ip->i_effnlink, (intmax_t)ip->i_size); if (I_IS_UFS2(ip)) printf(", extsize %d", ip->i_din2->di_extsize); printf("\n\tgeneration=%jx, uid=%d, gid=%d, flags=0x%b\n", (uintmax_t)ip->i_gen, ip->i_uid, ip->i_gid, (u_int)ip->i_flags, PRINT_INODE_FLAGS); printf("\tino %lu, on dev %s", (u_long)ip->i_number, devtoname(ITODEV(ip))); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ufsfifo_close( struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap) { ufs_close(ap); return (fifo_specops.vop_close(ap)); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ static int ufs_pathconf( struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap) { int error; error = 0; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = UFS_LINK_MAX; break; case _PC_NAME_MAX: *ap->a_retval = UFS_MAXNAMLEN; break; case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) *ap->a_retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; case _PC_NO_TRUNC: *ap->a_retval = 1; break; #ifdef UFS_ACL case _PC_ACL_EXTENDED: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; case _PC_ACL_NFS4: if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; #endif case _PC_ACL_PATH_MAX: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; #else *ap->a_retval = 3; #endif break; #ifdef MAC case _PC_MAC_PRESENT: if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL) *ap->a_retval = 1; else *ap->a_retval = 0; break; #endif case _PC_MIN_HOLE_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: *ap->a_retval = 64; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ufs_vinit(struct mount *mntp, struct vop_vector *fifoops, struct vnode **vpp) { struct inode *ip; struct vnode *vp; vp = *vpp; ASSERT_VOP_LOCKED(vp, "ufs_vinit"); ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); /* * Only unallocated inodes should be of type VNON. */ if (ip->i_mode != 0 && vp->v_type == VNON) return (EINVAL); if (vp->v_type == VFIFO) vp->v_op = fifoops; if (ip->i_number == UFS_ROOTINO) vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Allocate a new inode. * Vnode dvp must be locked. */ static int ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, const char *callfunc) { struct inode *ip, *pdir; struct direct newdir; struct vnode *tvp; int error; pdir = VTOI(dvp); -#ifdef INVARIANTS - if ((cnp->cn_flags & HASBUF) == 0) - panic("%s: no name", callfunc); -#endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; if (pdir->i_effnlink < 2) { print_bad_link_count(callfunc, dvp); return (EINVAL); } if (DOINGSUJ(dvp)) { error = softdep_prelink(dvp, NULL, cnp); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); ip = VTOI(tvp); ip->i_gid = pdir->i_gid; DIP_SET(ip, i_gid, pdir->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; gid_t ucred_group; ucp = cnp->cn_cred; #endif /* * If we are not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; DIP_SET(ip, i_uid, ip->i_uid); mode &= ~07111; #ifdef QUOTA /* * Make sure the correct user gets charged * for the space. * Quickly knock up a dummy credential for the victim. * XXX This seems to never be accessed out of our * context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; ucred.cr_groups[0] = pdir->i_gid; ucp = &ucred; #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vgone(tvp); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vgone(tvp); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ vn_seqc_write_begin(tvp); /* Mostly to cover asserts */ UFS_INODE_SET_FLAG(ip, IN_ACCESS | IN_CHANGE | IN_UPDATE); UFS_INODE_SET_MODE(ip, mode); DIP_SET(ip, i_mode, mode); tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 1; ip->i_nlink = 1; DIP_SET(ip, i_nlink, 1); if (DOINGSOFTDEP(tvp)) softdep_setup_create(VTOI(dvp), ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID)) { UFS_INODE_SET_MODE(ip, ip->i_mode & ~ISGID); DIP_SET(ip, i_mode, ip->i_mode); } if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Make sure inode goes to disk before directory entry. */ error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp)); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, mode, cnp->cn_cred, curthread); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, mode, cnp->cn_cred, curthread); if (error) goto bad; } #endif /* !UFS_ACL */ ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL); if (error) goto bad; vn_seqc_write_end(tvp); *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(tvp)) softdep_revert_create(VTOI(dvp), ip); vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); return (error); } static int ufs_ioctl(struct vop_ioctl_args *ap) { struct vnode *vp; int error; vp = ap->a_vp; switch (ap->a_command) { case FIOSEEKDATA: error = vn_lock(vp, LK_SHARED); if (error == 0) { error = ufs_bmap_seekdata(vp, (off_t *)ap->a_data); VOP_UNLOCK(vp); } else error = EBADF; return (error); case FIOSEEKHOLE: return (vn_bmap_seekhole(vp, ap->a_command, (off_t *)ap->a_data, ap->a_cred)); default: return (ENOTTY); } } static int ufs_read_pgcache(struct vop_read_pgcache_args *ap) { struct uio *uio; struct vnode *vp; uio = ap->a_uio; vp = ap->a_vp; VNPASS((vn_irflag_read(vp) & VIRF_PGREAD) != 0, vp); if (uio->uio_resid > ptoa(io_hold_cnt) || uio->uio_offset < 0 || (ap->a_ioflag & IO_DIRECT) != 0) return (EJUSTRETURN); return (vn_read_from_obj(vp, uio)); } /* Global vfs data structures for ufs. */ struct vop_vector ufs_vnodeops = { .vop_default = &default_vnodeops, .vop_fsync = VOP_PANIC, .vop_read = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_write = VOP_PANIC, .vop_accessx = ufs_accessx, .vop_bmap = ufs_bmap, .vop_fplookup_vexec = ufs_fplookup_vexec, .vop_fplookup_symlink = VOP_EAGAIN, .vop_cachedlookup = ufs_lookup, .vop_close = ufs_close, .vop_create = ufs_create, .vop_stat = ufs_stat, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_ioctl = ufs_ioctl, .vop_link = ufs_link, .vop_lookup = vfs_cache_lookup, .vop_mmapped = ufs_mmapped, .vop_mkdir = ufs_mkdir, .vop_mknod = ufs_mknod, .vop_need_inactive = ufs_need_inactive, .vop_open = ufs_open, .vop_pathconf = ufs_pathconf, .vop_poll = vop_stdpoll, .vop_print = ufs_print, .vop_read_pgcache = ufs_read_pgcache, .vop_readdir = ufs_readdir, .vop_readlink = ufs_readlink, .vop_reclaim = ufs_reclaim, .vop_remove = ufs_remove, .vop_rename = ufs_rename, .vop_rmdir = ufs_rmdir, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_strategy = ufs_strategy, .vop_symlink = ufs_symlink, .vop_whiteout = ufs_whiteout, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; VFS_VOP_VECTOR_REGISTER(ufs_vnodeops); struct vop_vector ufs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = VOP_PANIC, .vop_accessx = ufs_accessx, .vop_close = ufsfifo_close, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_pathconf = ufs_pathconf, .vop_print = ufs_print, .vop_read = VOP_PANIC, .vop_reclaim = ufs_reclaim, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_write = VOP_PANIC, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; VFS_VOP_VECTOR_REGISTER(ufs_fifoops);