diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index fa3085d2def9..30e312104bbb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -1,6070 +1,6067 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). 
This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; offset_t ndata; dmu_object_info_t doi; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. 
*/ } case _FIOGDIO: case _FIOSDIO: { return (0); } case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: { #ifdef illumos if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else off = *(offset_t *)data; #endif zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); #ifdef illumos if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else *(offset_t *)data = off; #endif return (0); } #ifdef illumos case _FIO_COUNT_FILLED: { /* * _FIO_COUNT_FILLED adds a new ioctl command which * exposes the number of filled blocks in a * ZFS object. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Wait for all dirty blocks for this object * to get synced out to disk, and the DMU info * updated. */ error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); if (error) { ZFS_EXIT(zfsvfs); return (error); } /* * Retrieve fill count from DMU object. */ error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); if (error) { ZFS_EXIT(zfsvfs); return (error); } ndata = doi.doi_fill_count; ZFS_EXIT(zfsvfs); if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) return (SET_ERROR(EFAULT)); return (0); } #endif } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result, a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considered clean despite having some dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } vm_page_sbusy(pp); } else if (pp != NULL) { ASSERT(!pp->valid); pp = NULL; } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } break; } return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); vm_object_pip_wakeup(pp->object); } static vm_page_t page_wire(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it.
*/ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_lock(pp); vm_page_wire(pp); vm_page_unlock(pp); } else pp = NULL; break; } return (pp); } static void page_unwire(vm_page_t pp) { vm_page_lock(pp); vm_page_unwire(pp, PQ_ACTIVE); vm_page_unlock(pp); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) { vm_object_t obj; struct sf_buf *sf; caddr_t va; int off; ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); vm_object_pip_add(obj, 1); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unbusy(pp); } len -= nbytes; off = 0; } vm_object_pip_wakeup(obj); zfs_vmobject_wunlock(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ static int mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(uio->uio_segflg == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); zfs_vmobject_wlock(obj); for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp->valid == 0) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); vm_page_sunbusy(pp); vm_page_lock(pp); if (error) { if (!vm_page_wired(pp) && pp->valid == 0 && !vm_page_busied(pp)) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_sunbusy(pp); } if (error) break; uio->uio_resid -= bytes; uio->uio_offset += bytes; len -= bytes; } zfs_vmobject_wunlock(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped.
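 *
 * E.g., on a machine with 4K pages, an 8K read at offset 4K is
 * broken into two PAGESIZE uiomoves, each served from the page
 * cache when the page is resident and from the DMU otherwise.
 * Roughly (a sketch of the loop below, not its exact code):
 *
 *	for each page in the range:
 *		if (page_wire(vp, start) != NULL)
 *			uiomove() from the sf_buf-mapped page
 *		else
 *			dmu_read_uio_dbuf(...)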
*/ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); vm_object_t obj; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if (pp = page_wire(vp, start)) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); #ifdef illumos error = uiomove(va + off, bytes, UIO_READ, uio); #else error = vn_io_fault_uiomove(va + off, bytes, uio); #endif zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unwire(pp); } else { zfs_vmobject_wunlock(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock(obj); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes; int error = 0; rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (zfsvfs->z_log && (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); #ifdef illumos if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(vp)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. 
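 * E.g., a 2K read that falls within a single 128K block still
 * borrows one full 128K ARC buffer here.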
*/ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } #endif /* illumos */ while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) error = mappedread_sf(vp, nbytes, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) { error = mappedread(vp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. for * snapshots), our callers might not be able to properly detect that * we are read-only, so check it explicitly here. */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } #ifdef illumos /* * Pre-fault the pages to ensure slow (e.g. NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf.
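 * (Otherwise the copy in dmu_write_uio_dbuf() below could fault on
 * an unresident user page while the transaction is assigned,
 * stalling the open txg for, e.g., a full NFS round trip.)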
*/ if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else uio_prefaultpages(MIN(n, max_blksz), uio); #endif /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); woff = rl->r_off; if (rl->r_len == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { abuf = NULL; woff = uio->uio_loffset; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = SET_ERROR(EDQUOT); break; } if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. 
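 * E.g., a 96K block (left over from a recordsize that has since
 * been lowered) grows to at most 128K here, no matter how much
 * is being written.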
*/ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); #ifdef illumos ASSERT(error == 0); #else ASSERT(error == 0 || error == EFAULT); #endif } /* * If we are replaying and eof is non-zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; if (error == 0) error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); else (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; #ifdef illumos if (!xuio && n > 0) uio_prefaultpages(MIN(n, max_blksz), uio); #endif } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error.
* Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } #ifdef __FreeBSD__ /* * EFAULT means that at least one page of the source buffer was not * available. VFS will re-try remaining I/O upon this error. */ if (error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } #endif if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_range_unlock(zgd->zgd_rl); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; int ltype; ASSERT_VOP_LOCKED(dvp, __func__); #ifdef DIAGNOSTIC if ((zdp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); #endif if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (dvp->v_iflag & VI_DOOMED) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. 
* A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* * Fast path lookup; however, we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed-in name. This means * creating 'a' and removing 'A' on a case-insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { #ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } #endif /* * We don't allow recursive attributes. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into the attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { vrele(*vpp); *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory.
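 * E.g., a ".." lookup at the root of /pool/fs/.zfs/snapshot/snap
 * must resolve to the "snapshot" directory in the parent file
 * system's .zfs tree, which belongs to a different zfsvfs.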
*/ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; ZFS_EXIT(zfsvfs); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { ZFS_EXIT(zfsvfs); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop retries the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } out: if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure.
* * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; void *vsecp = NULL; int flag = 0; uint64_t txtype; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve(1); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: if (error == 0) { *vpp = ZTOV(zp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked, toobig = FALSE; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; zp = VTOZ(vp); xattr_obj = 0; xzp = NULL; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *vpp = NULL; if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } cache_purge(dvp); error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); cache_purge(vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. 
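 * A single kernel-space iovec is filled in place; anything else
 * (user space, or more than one iovec) goes through a kmem
 * bounce buffer that is uiomove()d out when the entries have
 * been converted.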
*/ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry. */ next = &eodp->ed_off; eodp->ed_eflags = zap.za_normalization_conflict ? 
ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); /* NOTE: d_off is the offset for the *next* entry. */ next = &odp->d_off; (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; dirent_terminate(odp); odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } /* Fill the offset right after advancing the cursor. */ if (next != NULL) *next = offset; if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * ct - caller context * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef illumos vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; #else vn_fsid(vp, vap); #endif vap->va_nodeid = zp->z_id; vap->va_nlink = zp->z_links; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && zp->z_links < ZFS_LINK_MAX) vap->va_nlink++; vap->va_size = zp->z_size; #ifdef illumos vap->va_rdev = vp->v_rdev; #else if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); #endif vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. 
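	 * Each XVA_ISSET_REQ() case below copies one ZFS_* bit out of
	 * zp->z_pflags, and XVA_SET_RTN() marks that attribute as valid in
	 * the returned bitmap so the caller can tell which fields were set.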
*/ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. 
*/ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Note: ZFS_READONLY is handled in zfs_zaccess_common. */ /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? 
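		 * Note that zfs_freesp() is called with log == FALSE, so the
		 * truncate itself is not logged here; the size change is
		 * expected to be captured by the TX_SETATTR record written
		 * by zfs_log_setattr() below.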
*/ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. 
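		 * For example, if XAT_APPENDONLY is requested but already
		 * matches the ZFS_APPENDONLY bit in z_pflags, the request is
		 * cleared from xvap and remembered in tmpxvattr, to be
		 * re-asserted once the update has been applied.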
*/ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). 
*/ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. 
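	 * Attributes are staged into bulk[] (and xattr_bulk[] for an
	 * extended attribute directory) with SA_ADD_BULK_ATTR() and are
	 * written back by a single sa_bulk_update() per znode at the end.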
*/ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. 
*/ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { zfsvfs_t *zfsvfs; struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; VOP_UNLOCK(tdvp, 0); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp, 0); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; sdzp = VTOZ(sdvp); error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK(tdvp, 0); goto relock; } tdzp = VTOZ(tdvp); /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); error = SET_ERROR(EIO); goto out; } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. 
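	 * After dropping and re-acquiring the locks the name may refer to a
	 * different object, so look the entry up again by name instead of
	 * trusting the vnode we were originally handed.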
*/ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } svp = ZTOV(szp); /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); if (error != 0) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); vrele(svp); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } if (tzp != NULL) tvp = ZTOV(tzp); else tvp = NULL; /* * At present the vnode locks must be acquired before z_teardown_lock, * although it would be more logical to use the opposite order. */ ZFS_EXIT(zfsvfs); /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK(nvp, 0); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(*svpp, 0); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { zfsvfs_t *zfsvfs; znode_t *sdzp, *tdzp, *szp, *tzp; zilog_t *zilog = NULL; dmu_tx_t *tx; char *snm = scnp->cn_nameptr; char *tnm = tcnp->cn_nameptr; int error = 0; /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; zilog = zfsvfs->z_log; /* * After we re-enter ZFS_ENTER() we will have to revalidate all * znodes involved. */ ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto unlockout; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto unlockout; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto unlockout; } /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); goto unlockout; } szp = VTOZ(*svpp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { error = SET_ERROR(EIO); goto unlockout; } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto unlockout; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto unlockout; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_check(szp, sdzp, tdzp)) goto unlockout; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto unlockout; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto unlockout; } } } vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. 
*/ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto unlockout; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } if (error == 0) { cache_purge(*svpp); if (*tvpp != NULL) cache_purge(*tvpp); cache_purge_negative(tdvp); } } dmu_tx_commit(tx); unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(*svpp, 0); VOP_UNLOCK(sdvp, 0); out: /* original two vnodes are locked */ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (*tvpp != NULL) VOP_UNLOCK(*tvpp, 0); if (tdvp != *tvpp) VOP_UNLOCK(tdvp, 0); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. 
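	 * With ZNEW, zfs_dirent_lookup() fails with EEXIST if the name is
	 * already present; on success no znode is returned and the
	 * exclusively locked directory vnode fends off concurrent creates.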
	 */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * For version 4 ZPL datasets the symlink will be an SA attribute.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);

	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	*vpp = ZTOV(zp);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 * IN:	vp	- vnode of symbolic link.
 *	uio	- structure to contain the link path.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- structure containing the link path.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 * IN:	tdvp	- Directory to contain new entry.
 *	svp	- vnode of new entry.
 *	name	- name of new entry.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;
	uint64_t	parent;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
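	 * Hard links to directories would allow cycles in the namespace,
	 * which is why practically every implementation rejects them
	 * whatever the errno.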
*/ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(svp, ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. 
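		 * The unlinked object itself is cleaned up when the vnode is
		 * reclaimed: zfs_freebsd_reclaim() calls zfs_zinactive(),
		 * which tears down the znode.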
*/ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } rw_exit(&zfsvfs->z_teardown_inactive_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; #ifdef illumos if (fidp->fid_len < size) { fidp->fid_len = size; ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } #else fidp->fid_len = size; #endif zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; int error; switch (cmd) { case _PC_LINK_MAX: *valp = MIN(LONG_MAX, ZFS_LINK_MAX); return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); #ifdef illumos case _PC_XATTR_EXISTS: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = 0; error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { if (!zfs_dirempty(xzp)) *valp = 1; vrele(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the * same as having zero of them. 
*/ error = 0; } ZFS_EXIT(zfsvfs); return (error); case _PC_SATTR_ENABLED: case _PC_SATTR_EXISTS: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_type == VREG || vp->v_type == VDIR); return (0); case _PC_ACCESS_FILTERING: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && vp->v_type == VDIR; return (0); case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); #endif /* illumos */ case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); #ifdef illumos case _PC_TIMESTAMP_RESOLUTION: /* nanosecond timestamp resolution */ *valp = 1L; return (0); #endif case _PC_ACL_EXTENDED: *valp = 0; return (0); case _PC_ACL_NFS4: *valp = 1; return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zp->z_zfsvfs->z_os; rl_t *rl; vm_object_t object; off_t start, end, obj_size; uint_t blksz; int pgsin_b, pgsin_a; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); /* * Lock a range covering all required and optional pages. * Note that we need to handle the case of the block size growing. */ for (;;) { blksz = zp->z_blksz; rl = zfs_range_lock(zp, rounddown(start, blksz), roundup(end, blksz) - rounddown(start, blksz), RL_READER); if (blksz == zp->z_blksz) break; zfs_range_unlock(rl); } object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } pgsin_b = 0; if (rbehind != NULL) { pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); pgsin_b = MIN(*rbehind, pgsin_b); } pgsin_a = 0; if (rahead != NULL) { pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); if (end + IDX_TO_OFF(pgsin_a) >= obj_size) pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); pgsin_a = MIN(*rahead, pgsin_a); } /* * NB: we need to pass the exact byte size of the data that we expect * to read after accounting for the file size. This is required because * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. 
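	 * For example, with 4KB pages, obj_size = 8448 and end = 12288, the
	 * last page starts at offset 8192, so only MIN(end, obj_size) -
	 * (end - PAGE_SIZE) = 8448 - 8192 = 256 bytes may be requested.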
*/ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); if (error != 0) return (zfs_vm_pagerret_error); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); if (rbehind != NULL) *rbehind = pgsin_b; if (rahead != NULL) *rahead = pgsin_a; return (zfs_vm_pagerret_ok); } static int zfs_freebsd_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; rl_t *rl; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); object = vp->v_object; pcount = btoc(len); ncount = pcount; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? 
			    PAGE_SIZE : len;
			va = zfs_map_page(ma[i], &sf);
			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
			zfs_unmap_page(sf);
		}
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
	}

	if (err == 0) {
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);

		zfs_vmobject_wlock(object);
		for (i = 0; i < ncount; i++) {
			rtvals[i] = zfs_vm_pagerret_ok;
			vm_page_undirty(ma[i]);
		}
		zfs_vmobject_wunlock(object);
		VM_CNT_INC(v_vnodeout);
		VM_CNT_ADD(v_vnodepgsout, ncount);
	}
	dmu_tx_commit(tx);

out:
	zfs_range_unlock(rl);
	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (rtvals[0]);
}

int
zfs_freebsd_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
	} */ *ap;
{

	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
	    ap->a_rtvals));
}

static int
zfs_freebsd_bmap(ap)
	struct vop_bmap_args /* {
		struct vnode *a_vp;
		daddr_t a_bn;
		struct bufobj **a_bop;
		daddr_t *a_bnp;
		int *a_runp;
		int *a_runb;
	} */ *ap;
{

	if (ap->a_bop != NULL)
		*ap->a_bop = &ap->a_vp->v_bufobj;
	if (ap->a_bnp != NULL)
		*ap->a_bnp = ap->a_bn;
	if (ap->a_runp != NULL)
		*ap->a_runp = 0;
	if (ap->a_runb != NULL)
		*ap->a_runb = 0;

	return (0);
}

static int
zfs_freebsd_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	int error;

	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
	if (error == 0)
		vnode_create_vobject(vp, zp->z_size, ap->a_td);
	return (error);
}

static int
zfs_freebsd_close(ap)
	struct vop_close_args /* {
		struct vnode *a_vp;
		int a_fflag;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
}

static int
zfs_freebsd_ioctl(ap)
	struct vop_ioctl_args /* {
		struct vnode *a_vp;
		u_long a_command;
		caddr_t a_data;
		int a_fflag;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{

	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
	    ap->a_fflag, ap->a_cred, NULL, NULL));
}

static int
ioflags(int ioflags)
{
	int flags = 0;

	if (ioflags & IO_APPEND)
		flags |= FAPPEND;
	if (ioflags & IO_NDELAY)
		flags |= FNONBLOCK;
	if (ioflags & IO_SYNC)
		flags |= (FSYNC | FDSYNC | FRSYNC);

	return (flags);
}

static int
zfs_freebsd_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
	    ap->a_cred, NULL));
}

static int
zfs_freebsd_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
	    ap->a_cred, NULL));
}

static int
zfs_freebsd_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		accmode_t a_accmode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	accmode_t accmode;
	int error = 0;

	/*
	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and
VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. */ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } static int zfs_freebsd_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread, 0)); } static int zfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap)); } static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int error, mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread); if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (error); } static int zfs_freebsd_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } static int zfs_freebsd_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { vattr_t *vap = ap->a_vap; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, ap->a_cnp->cn_cred)); } static int zfs_freebsd_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int zfs_freebsd_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } static int zfs_freebsd_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } static int zfs_freebsd_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vattr_t *vap = 
ap->a_vap; xvattr_t xvap; u_long fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } static int zfs_freebsd_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; u_long fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. */ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on objects they * have VADMIN rights for. 
*/ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } if (vap->va_birthtime.tv_sec != VNOVAL) { xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); } static int zfs_freebsd_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } static int zfs_freebsd_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode.
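 * vattr_init_mask() below derives the ZFS attribute mask from
 * whichever fields are not VNOVAL; in sketch form (an
 * illustration of the idea, not the verbatim helper):
 *
 *	if (vap->va_mode != (mode_t)VNOVAL)
 *		vap->va_mask |= AT_MODE;
 *	if (vap->va_uid != (uid_t)VNOVAL)
 *		vap->va_mask |= AT_UID;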
*/ vattr_init_mask(vap); return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } static int zfs_freebsd_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); } static int zfs_freebsd_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfs_inactive(vp, ap->a_td->td_ucred, NULL); return (0); } static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp != NULL); - /* Destroy the vm object and flush associated pages. */ - vnode_destroy_vobject(vp); - /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); vp->v_data = NULL; return (0); } static int zfs_freebsd_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } static int zfs_freebsd_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) { *ap->a_retval = val; return (error); } if (error != EOPNOTSUPP) return (error); switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { *ap->a_retval = PIPE_BUF; return (0); } return (EINVAL); default: return (vop_stdpathconf(ap)); } } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE PREFIX * system freebsd:system: * user (none, can be used to access ZFS fsattr(5) attributes * created on Solaris) */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size) { const char *namespace, *prefix, *suffix; /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (EINVAL); /* We don't allow attribute names that start with "freebsd:" string. */ if (strncmp(name, "freebsd:", 8) == 0) return (EINVAL); bzero(attrname, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: #if 0 prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_USER_STRING; suffix = ":"; #else /* * This is the default namespace by which we can access all * attributes created on Solaris. 
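 *
 * Illustrative examples of the names this function produces:
 *
 *	EXTATTR_NAMESPACE_USER,   "md5" -> ZFS attribute "md5"
 *	EXTATTR_NAMESPACE_SYSTEM, "md5" -> "freebsd:system:md5"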
*/ prefix = namespace = suffix = ""; #endif break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (EINVAL); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (ENAMETOOLONG); } return (0); } /* * Vnode operating to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); if (error == ENOENT) error = ENOATTR; return (error); } if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to remove a named attribute. */ int zfs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, td); error = namei(&nd); vp = nd.ni_vp; if (error != 0) { ZFS_EXIT(zfsvfs); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ENOENT) error = ENOATTR; return (error); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to set a named attribute. 
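 *
 * ZFS keeps each extended attribute as a plain file in a hidden
 * xattr directory, so the body below reduces to this sketch
 * (not verbatim code):
 *
 *	zfs_lookup(vp, NULL, &xvp, ..., LOOKUP_XATTR | CREATE_XATTR_DIR);
 *	vn_open_cred(&nd, &flags, 0600, ...);	open O_WRONLY | O_CREAT
 *	VOP_SETATTR(vp, &va, cred);		truncate to length 0
 *	VOP_WRITE(vp, ap->a_uio, IO_UNIT, cred);	write the new value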
*/ static int zfs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR | CREATE_XATTR_DIR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FFLAGS(O_WRONLY | O_CREAT); NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrprefix[16]; u_char dirbuf[sizeof(struct dirent)]; struct dirent *dp; struct iovec aiov; struct uio auio, *uio = ap->a_uio; size_t *sizep = ap->a_size; size_t plen; vnode_t *xvp = NULL, *vp; int done, error, eof, pos; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof(attrprefix)); if (error != 0) return (error); plen = strlen(attrprefix); ZFS_ENTER(zfsvfs); if (sizep != NULL) *sizep = 0; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); error = namei(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; do { u_char nlen; aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof(dirbuf); auio.uio_resid = sizeof(dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); done = sizeof(dirbuf) - auio.uio_resid; if (error != 0) break; for (pos = 0; pos < done;) { dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. 
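 *
 * The list handed back to userland is a sequence of one-byte
 * length prefixes followed by the names, with no NUL
 * terminators; e.g. attributes "md5" and "tag" come out as the
 * eight bytes
 *
 *	\3 m d 5  \3 t a g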
*/ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; nlen = dp->d_namlen - plen; if (sizep != NULL) *sizep += 1 + nlen; else if (uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, uio->uio_rw, uio); if (error == 0) { error = uiomove(dp->d_name + plen, nlen, uio->uio_rw, uio); } if (error != 0) break; } } } while (!eof && error == 0); vput(vp); ZFS_EXIT(zfsvfs); return (error); } int zfs_freebsd_getacl(ap) struct vop_getacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } int zfs_freebsd_setacl(ap) struct vop_setacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); kmem_free(aaclp, aclbsize); return (error); } int zfs_freebsd_aclcheck(ap) struct vop_aclcheck_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode.
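 *
 * (VOP_VPTOCNP is the inverse of a lookup: it writes this
 * vnode's name at the tail of a_buf and returns a referenced
 * parent vnode; vn_fullpath(9) iterates it to build an absolute
 * path, e.g.
 *
 *	error = vn_fullpath(curthread, vp, &retbuf, &freebuf);
 *
 * where retbuf and freebuf are illustrative variable names.)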
*/ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; bcopy(name, ap->a_buf + *ap->a_buflen, len); *ap->a_vpp = ZTOV(dzp); } ZFS_EXIT(zfsvfs); return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; vhold(covered_vp); ltype = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); if (error == 0) { error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, ap->a_buf, ap->a_buflen); vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if ((vp->v_iflag & VI_DOOMED) != 0) error = SET_ERROR(ENOENT); return (error); } #ifdef DIAGNOSTIC static int zfs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { vnode_t *vp; znode_t *zp; int err; err = vop_stdlock(ap); if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { vp = ap->a_vp; zp = vp->v_data; if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); } return (err); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, .vop_allocate = VOP_EINVAL, .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_lookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #ifdef DIAGNOSTIC .vop_lock1 = zfs_lock, #endif }; struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, }; /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = 
zfs_freebsd_pathconf, }; diff --git a/sys/fs/cd9660/cd9660_node.c b/sys/fs/cd9660/cd9660_node.c index 17d70d869b79..c74bf4a3005c 100644 --- a/sys/fs/cd9660/cd9660_node.c +++ b/sys/fs/cd9660/cd9660_node.c @@ -1,331 +1,327 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1994, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_node.c 8.2 (Berkeley) 1/23/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include static unsigned cd9660_chars2ui(unsigned char *begin, int len); /* * Last reference to an inode, write the inode out and if necessary, * truncate and deallocate the file. */ int cd9660_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); int error = 0; /* * If we are done with the inode, reclaim it * so that it can be reused immediately. */ if (ip->inode.iso_mode == 0) vrecycle(vp); return error; } /* * Reclaim an inode so that it can be used for other purposes. */ int cd9660_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; - /* - * Destroy the vm object and flush associated pages. - */ - vnode_destroy_vobject(vp); /* * Remove the inode from its hash chain. */ vfs_hash_remove(vp); /* * Purge old data structures associated with the inode. 
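 *
 * (This change also drops the explicit vnode_destroy_vobject()
 * call that used to open this function, matching the zfs and
 * devfs reclaim hunks; presumably the generic vgone()/reclaim
 * path is now responsible for destroying the VM object.)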
*/ free(vp->v_data, M_ISOFSNODE); vp->v_data = NULL; return (0); } /* * File attributes */ void cd9660_defattr(isodir, inop, bp, ftype) struct iso_directory_record *isodir; struct iso_node *inop; struct buf *bp; enum ISO_FTYPE ftype; { struct buf *bp2 = NULL; struct iso_mnt *imp; struct iso_extended_attributes *ap = NULL; int off; /* high sierra does not have timezone data, flag is one byte ahead */ if (isonum_711(ftype == ISO_FTYPE_HIGH_SIERRA? &isodir->date[6]: isodir->flags)&2) { inop->inode.iso_mode = S_IFDIR; /* * If we return 2, fts() will assume there are no subdirectories * (just links for the path and .), so instead we return 1. */ inop->inode.iso_links = 1; } else { inop->inode.iso_mode = S_IFREG; inop->inode.iso_links = 1; } if (!bp && ((imp = inop->i_mnt)->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) { cd9660_blkatoff(ITOV(inop), (off_t)-(off << imp->im_bshift), NULL, &bp2); bp = bp2; } if (bp) { ap = (struct iso_extended_attributes *)bp->b_data; if (isonum_711(ap->version) == 1) { if (!(ap->perm[0]&0x40)) inop->inode.iso_mode |= S_IXOTH; if (!(ap->perm[0]&0x10)) inop->inode.iso_mode |= S_IROTH; if (!(ap->perm[0]&4)) inop->inode.iso_mode |= S_IXGRP; if (!(ap->perm[0]&1)) inop->inode.iso_mode |= S_IRGRP; if (!(ap->perm[1]&0x40)) inop->inode.iso_mode |= S_IXUSR; if (!(ap->perm[1]&0x10)) inop->inode.iso_mode |= S_IRUSR; inop->inode.iso_uid = isonum_723(ap->owner); /* what about 0? */ inop->inode.iso_gid = isonum_723(ap->group); /* what about 0? */ } else ap = NULL; } if (!ap) { inop->inode.iso_mode |= S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; inop->inode.iso_uid = (uid_t)0; inop->inode.iso_gid = (gid_t)0; } if (bp2) brelse(bp2); } /* * Time stamps */ void cd9660_deftstamp(isodir,inop,bp,ftype) struct iso_directory_record *isodir; struct iso_node *inop; struct buf *bp; enum ISO_FTYPE ftype; { struct buf *bp2 = NULL; struct iso_mnt *imp; struct iso_extended_attributes *ap = NULL; int off; if (!bp && ((imp = inop->i_mnt)->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) { cd9660_blkatoff(ITOV(inop), (off_t)-(off << imp->im_bshift), NULL, &bp2); bp = bp2; } if (bp) { ap = (struct iso_extended_attributes *)bp->b_data; if (ftype != ISO_FTYPE_HIGH_SIERRA && isonum_711(ap->version) == 1) { if (!cd9660_tstamp_conv17(ap->ftime,&inop->inode.iso_atime)) cd9660_tstamp_conv17(ap->ctime,&inop->inode.iso_atime); if (!cd9660_tstamp_conv17(ap->ctime,&inop->inode.iso_ctime)) inop->inode.iso_ctime = inop->inode.iso_atime; if (!cd9660_tstamp_conv17(ap->mtime,&inop->inode.iso_mtime)) inop->inode.iso_mtime = inop->inode.iso_ctime; } else ap = NULL; } if (!ap) { cd9660_tstamp_conv7(isodir->date,&inop->inode.iso_ctime,ftype); inop->inode.iso_atime = inop->inode.iso_ctime; inop->inode.iso_mtime = inop->inode.iso_ctime; } if (bp2) brelse(bp2); } int cd9660_tstamp_conv7(pi,pu,ftype) u_char *pi; struct timespec *pu; enum ISO_FTYPE ftype; { int crtime, days; int y, m, d, hour, minute, second, tz; y = pi[0] + 1900; m = pi[1]; d = pi[2]; hour = pi[3]; minute = pi[4]; second = pi[5]; if(ftype != ISO_FTYPE_HIGH_SIERRA) tz = ((signed char *)pi)[6]; /* Timezone value is signed. */ else /* original high sierra misses timezone data */ tz = 0; if (y < 1970) { pu->tv_sec = 0; pu->tv_nsec = 0; return 0; } else { #ifdef ORIGINAL /* computes day number relative to Sept. 19th,1989 */ /* don't even *THINK* about changing formula. It works! 
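 *
 * (Sanity check for the active, non-ORIGINAL variant below:
 * y=1970, m=1, d=1 yields days == 0, the Unix epoch, and
 * y=1970, m=1, d=2 yields days == 1.)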
*/ days = 367*(y-1980)-7*(y+(m+9)/12)/4-3*((y+(m-9)/7)/100+1)/4+275*m/9+d-100; #else /* * Changed :-) to make it relative to Jan. 1st, 1970 * and to disambiguate negative division */ days = 367*(y-1960)-7*(y+(m+9)/12)/4-3*((y+(m+9)/12-1)/100+1)/4+275*m/9+d-239; #endif crtime = ((((days * 24) + hour) * 60 + minute) * 60) + second; /* timezone offset is unreliable on some disks */ if (-48 <= tz && tz <= 52) crtime -= tz * 15 * 60; } pu->tv_sec = crtime; pu->tv_nsec = 0; return 1; } static u_int cd9660_chars2ui(begin,len) u_char *begin; int len; { u_int rc; for (rc = 0; --len >= 0;) { rc *= 10; rc += *begin++ - '0'; } return rc; } int cd9660_tstamp_conv17(pi,pu) u_char *pi; struct timespec *pu; { u_char buf[7]; /* year:"0001"-"9999" -> -1900 */ buf[0] = cd9660_chars2ui(pi,4) - 1900; /* month: " 1"-"12" -> 1 - 12 */ buf[1] = cd9660_chars2ui(pi + 4,2); /* day: " 1"-"31" -> 1 - 31 */ buf[2] = cd9660_chars2ui(pi + 6,2); /* hour: " 0"-"23" -> 0 - 23 */ buf[3] = cd9660_chars2ui(pi + 8,2); /* minute:" 0"-"59" -> 0 - 59 */ buf[4] = cd9660_chars2ui(pi + 10,2); /* second:" 0"-"59" -> 0 - 59 */ buf[5] = cd9660_chars2ui(pi + 12,2); /* difference of GMT */ buf[6] = pi[16]; return cd9660_tstamp_conv7(buf, pu, ISO_FTYPE_DEFAULT); } cd_ino_t isodirino(isodir, imp) struct iso_directory_record *isodir; struct iso_mnt *imp; { cd_ino_t ino; /* * Note there is an inverse calculation in * cd9660_vfsops.c:cd9660_vget_internal(): * ip->iso_start = ino >> imp->im_bshift; * and also a calculation of the isodir pointer * from an inode in cd9660_vnops.c:cd9660_readlink() */ ino = ((cd_ino_t)isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length)) << imp->im_bshift; return ino; } diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index 590334f1615b..ab6e7e13b8ba 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -1,1996 +1,1995 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; static struct fileops devfs_ops_f; #include #include #include #include #include #include static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct sx clone_drain_lock; SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock"); struct mtx cdevpriv_mtx; MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF); SYSCTL_DECL(_vfs_devfs); static int devfs_dotimes; SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW, &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision"); /* * Update devfs node timestamp. Note that updates are unlocked and * stat(2) could see partially updated times. */ static void devfs_timestamp(struct timespec *tsp) { time_t ts; if (devfs_dotimes) { vfs_timestamp(tsp); } else { ts = time_second; if (tsp->tv_sec != ts) { tsp->tv_sec = ts; tsp->tv_nsec = 0; } } } static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp, int *ref) { *dswp = devvn_refthread(fp->f_vnode, devp, ref); if (*devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp, *ref); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); curthread->td_fpop = fp; return (0); } int devfs_get_cdevpriv(void **datap) { struct file *fp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (EBADF); p = fp->f_cdevpriv; if (p != NULL) { error = 0; *datap = p->cdpd_data; } else error = ENOENT; return (error); } int devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr) { struct file *fp; struct cdev_priv *cdp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (ENOENT); cdp = cdev2priv((struct cdev *)fp->f_data); p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK); p->cdpd_data = priv; p->cdpd_dtr = priv_dtr; p->cdpd_fp = fp; mtx_lock(&cdevpriv_mtx); if (fp->f_cdevpriv == NULL) { LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list); fp->f_cdevpriv = p; mtx_unlock(&cdevpriv_mtx); error = 0; } else { mtx_unlock(&cdevpriv_mtx); free(p, M_CDEVPDATA); error = EBUSY; } return (error); } void devfs_destroy_cdevpriv(struct cdev_privdata *p) { mtx_assert(&cdevpriv_mtx, MA_OWNED); KASSERT(p->cdpd_fp->f_cdevpriv == p, ("devfs_destoy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p)); p->cdpd_fp->f_cdevpriv = NULL; LIST_REMOVE(p, cdpd_list); mtx_unlock(&cdevpriv_mtx); (p->cdpd_dtr)(p->cdpd_data); free(p, M_CDEVPDATA); } static void devfs_fpdrop(struct file *fp) { struct cdev_privdata *p; mtx_lock(&cdevpriv_mtx); if ((p = fp->f_cdevpriv) == NULL) { mtx_unlock(&cdevpriv_mtx); return; } devfs_destroy_cdevpriv(p); } void devfs_clear_cdevpriv(void) { struct file *fp; fp = curthread->td_fpop; if (fp == NULL) return; devfs_fpdrop(fp); } /* * On success devfs_populate_vp() returns with dmp->dm_lock held. 
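 * On failure the lock has been dropped, so a typical caller
 * looks like this sketch:
 *
 *	error = devfs_populate_vp(vp);
 *	if (error != 0)
 *		return (error);
 *	dmp = VFSTODEVFS(vp->v_mount);
 *	... inspect vp->v_data while dm_lock is held ...
 *	sx_xunlock(&dmp->dm_lock);
 */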
*/ static int devfs_populate_vp(struct vnode *vp) { struct devfs_dirent *de; struct devfs_mount *dmp; int locked; ASSERT_VOP_LOCKED(vp, "devfs_populate_vp"); dmp = VFSTODEVFS(vp->v_mount); locked = VOP_ISLOCKED(vp); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); /* Can't call devfs_populate() with the vnode lock held. */ VOP_UNLOCK(vp, 0); devfs_populate(dmp); sx_xunlock(&dmp->dm_lock); vn_lock(vp, locked | LK_RETRY); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ERESTART); } if ((vp->v_iflag & VI_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } de = vp->v_data; KASSERT(de != NULL, ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed")); if ((de->de_flags & DE_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } return (0); } static int devfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct devfs_mount *dmp; char *buf = ap->a_buf; int *buflen = ap->a_buflen; struct devfs_dirent *dd, *de; int i, error; dmp = VFSTODEVFS(vp->v_mount); error = devfs_populate_vp(vp); if (error != 0) return (error); i = *buflen; dd = vp->v_data; if (vp->v_type == VCHR) { i -= strlen(dd->de_cdp->cdp_c.si_name); if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_cdp->cdp_c.si_name, buf + i, strlen(dd->de_cdp->cdp_c.si_name)); de = dd->de_dir; } else if (vp->v_type == VDIR) { if (dd == dmp->dm_rootdir) { *dvp = vp; vref(*dvp); goto finished; } i -= dd->de_dirent->d_namlen; if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); de = dd; } else { error = ENOENT; goto finished; } *buflen = i; de = devfs_parent_dirent(de); if (de == NULL) { error = ENOENT; goto finished; } mtx_lock(&devfs_de_interlock); *dvp = de->de_vnode; if (*dvp != NULL) { VI_LOCK(*dvp); mtx_unlock(&devfs_de_interlock); vholdl(*dvp); VI_UNLOCK(*dvp); vref(*dvp); vdrop(*dvp); } else { mtx_unlock(&devfs_de_interlock); error = ENOENT; } finished: sx_xunlock(&dmp->dm_lock); return (error); } /* * Construct the fully qualified path name relative to the mountpoint. * If a NULL cnp is provided, no '/' is appended to the resulting path. 
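 *
 * For example (with illustrative names), given a dirent chain
 * "usb" and a cnp of "ugen0.1":
 *
 *	p = devfs_fqpn(specname, dmp, dd, cnp);	p -> "usb/ugen0.1"
 *
 * The name is assembled backwards from the end of buf, so the
 * returned pointer lies inside buf rather than at its start.
 */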
*/ char * devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd, struct componentname *cnp) { int i; struct devfs_dirent *de; sx_assert(&dmp->dm_lock, SA_LOCKED); i = SPECNAMELEN; buf[i] = '\0'; if (cnp != NULL) i -= cnp->cn_namelen; if (i < 0) return (NULL); if (cnp != NULL) bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { if (cnp != NULL || i < SPECNAMELEN) { i--; if (i < 0) return (NULL); buf[i] = '/'; } i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = devfs_parent_dirent(de); if (de == NULL) return (NULL); } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } static void devfs_insmntque_dtr(struct vnode *vp, void *arg) { struct devfs_dirent *de; de = (struct devfs_dirent *)arg; mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode, struct vnode **vpp) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; struct cdevsw *dsw; dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { VI_LOCK(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); vget(vp, lockmode | LK_INTERLOCK | LK_RETRY, curthread); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } else if ((vp->v_iflag & VI_DOOMED) != 0) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vput(vp); goto loop; } sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; KASSERT(vp->v_usecount == 1, ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount)); dev->si_usecount += vp->v_usecount; /* Special casing of ttys for deadfs. Probably redundant. 
*/ dsw = dev->si_devsw; if (dsw != NULL && (dsw->d_flags & D_TTY) != 0) vp->v_vflag |= VV_ISTTY; dev_unlock(); VI_UNLOCK(vp); if ((dev->si_flags & SI_ETERNAL) != 0) vp->v_vflag |= VV_ETERNALDEV; vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS); VN_LOCK_ASHARE(vp); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp, devfs_insmntque_dtr, de); if (error != 0) { (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct proc *p; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_accmode, ap->a_cred, NULL); if (error == 0) return (0); if (error != EACCES) return (error); p = ap->a_td->td_proc; /* We do, however, allow access to the controlling terminal */ PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == de->de_cdp) error = 0; PROC_UNLOCK(p); return (error); } _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0, "devfs-only flag reuse failed"); static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct proc *p; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; int dflags, error, ref, vp_locked; /* * XXX: Don't call d_close() if we were called because of * XXX: insmntque1() failure. */ if (vp->v_data == NULL) return (0); /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (td != NULL) { p = td->td_proc; PROC_LOCK(p); if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); VI_LOCK(vp); if (count_dev(dev) == 2 && (vp->v_iflag & VI_DOOMED) == 0) { p->p_session->s_ttyvp = NULL; p->p_session->s_ttydp = NULL; oldvp = vp; } VI_UNLOCK(vp); SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) vrele(oldvp); } else PROC_UNLOCK(p); } /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); dflags = 0; VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { /* Forced close. */ dflags |= FREVOKE | FNONBLOCK; } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. 
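 * (D_TRACKCLOSE drivers want d_close() on every close(2), not
 * just the last one, so the count_dev() early return below is
 * deliberately skipped.)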
*/ } else if (count_dev(dev) > 1) { VI_UNLOCK(vp); dev_relthread(dev, ref); return (0); } if (count_dev(dev) == 1) dflags |= FLASTCLOSE; vholdl(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td); dev_relthread(dev, ref); vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { int error; struct file *fpop; /* * NB: td may be NULL if this descriptor is closed due to * garbage collection from a closed UNIX domain socket. */ fpop = curthread->td_fpop; curthread->td_fpop = fp; error = vnops.fo_close(fp, td); curthread->td_fpop = fpop; /* * The f_cdevpriv cannot be assigned non-NULL value while we * are destroying the file. */ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); return (error); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct devfs_dirent *de; struct devfs_mount *dmp; struct cdev *dev; struct timeval boottime; int error; error = devfs_populate_vp(vp); if (error != 0) return (error); dmp = VFSTODEVFS(vp->v_mount); sx_xunlock(&dmp->dm_lock); de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; getboottime(&boottime); #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = cdev2priv(dev)->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_filerev = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct file *fpop; int error; fpop = td->td_fpop; td->td_fpop = fp; error = vnops.fo_ioctl(fp, com, data, cred, td); td->td_fpop = fpop; return (error); } void * fiodgname_buf_get_ptr(void *fgnp, u_long com) { union { struct fiodgname_arg fgn; #ifdef COMPAT_FREEBSD32 struct fiodgname_arg32 fgn32; #endif } *fgnup; fgnup = fgnp; switch (com) { case FIODGNAME: return (fgnup->fgn.buf); #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: return ((void *)(uintptr_t)fgnup->fgn32.buf); #endif default: panic("Unhandled ioctl command %ld", com); } } static int devfs_ioctl(struct vop_ioctl_args *ap) { struct fiodgname_arg *fgn; struct vnode *vpold, *vp; struct cdevsw *dsw; struct thread *td; struct cdev *dev; int error, ref, i; const char *p; u_long com; vp = ap->a_vp; com = ap->a_command; td = ap->a_td; dsw = devvn_refthread(vp, &dev, &ref); if (dsw == NULL) return (ENXIO); KASSERT(dev->si_refcount > 0, ("devfs: 
un-referenced struct cdev *(%s)", devtoname(dev))); switch (com) { case FIODTYPE: *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); break; default: error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); } dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { /* Do nothing if reassigning same control tty */ sx_slock(&proctree_lock); if (td->td_proc->p_session->s_ttyvp == vp) { sx_sunlock(&proctree_lock); return (0); } vpold = td->td_proc->p_session->s_ttyvp; VREF(vp); SESS_LOCK(td->td_proc->p_session); td->td_proc->p_session->s_ttyvp = vp; td->td_proc->p_session->s_ttydp = cdev2priv(dev); SESS_UNLOCK(td->td_proc->p_session); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) vrele(vpold); } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; struct thread *td; td = curthread; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error) return (error); error = dsw->d_kqfilter(dev, kn); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } static inline int devfs_prison_check(struct devfs_dirent *de, struct thread *td) { struct cdev_priv *cdp; struct ucred *dcr; struct proc *p; int error; cdp = de->de_cdp; if (cdp == NULL) return (0); dcr = cdp->cdp_c.si_cred; if (dcr == NULL) return (0); error = prison_check(td->td_ucred, dcr); if (error == 0) return (0); /* We do, however, allow access to the controlling terminal */ p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == cdp) error = 0; PROC_UNLOCK(p); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct mount *mp; struct cdev *cdev; int error, flags, nameiop, dvplocked; char specname[SPECNAMELEN + 1], *pname; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; mp = dvp->v_mount; dmp = VFSTODEVFS(mp); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); de = devfs_parent_dirent(dd); if (de == NULL) return (ENOENT); dvplocked = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; vn_lock(dvp, dvplocked | LK_RETRY); return (error); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0); while (de == NULL) { /* While(...) 
so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dmp, dd, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); sx_slock(&clone_drain_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); sx_sunlock(&clone_drain_lock); if (cdev == NULL) sx_xlock(&dmp->dm_lock); else if (devfs_populate_vp(dvp) != 0) { *dm_unlock = 0; sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } else sx_xunlock(&dmp->dm_lock); dev_rel(cdev); return (ENOENT); } if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (cdev != NULL) dev_rel(cdev); return (ENOENT); } if (cdev == NULL) break; dev_lock(); dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } if (devfs_prison_check(de, td)) return (ENOENT); if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOTDIR); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. */ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error, ref, vlocked; struct cdevsw *dsw; struct file *fpop; struct mtx *mtxp; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. 
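 * (physio() and similar consumers bound transfer sizes with
 * si_iosize_max; DFLTPHYS is a conservative default.)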
*/ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); if (fp == NULL && dsw->d_fdopen != NULL) { dev_relthread(dev, ref); return (ENXIO); } vlocked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); fpop = td->td_fpop; td->td_fpop = fp; if (fp != NULL) { fp->f_data = dev; fp->f_vnode = vp; } if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); /* Clean up any cdevpriv upon error. */ if (error != 0) devfs_clear_cdevpriv(); td->td_fpop = fpop; vn_lock(vp, vlocked | LK_RETRY); dev_relthread(dev, ref); if (error != 0) { if (error == ERESTART) error = EINTR; return (error); } #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if (fp == NULL) return (error); #endif if (fp->f_ops == &badfileops) finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); mtxp = mtx_pool_find(mtxpool_sleep, fp); /* * Hint to the dofilewrite() to not force the buffer draining * on the writer to the file. Most likely, the write would * not need normal buffers. */ mtx_lock(mtxp); fp->f_vnread_flags |= FDEVFS_VNODE; mtx_unlock(mtxp); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = INT_MAX; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_MAX_CANON: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_CANON; return (0); } return (EINVAL); case _PC_MAX_INPUT: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_INPUT; return (0); } return (EINVAL); case _PC_VDISABLE: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = _POSIX_VDISABLE; return (0); } return (EINVAL); case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistant label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_poll(fp, events, cred, td); return (error); } error = dsw->d_poll(dev, events, td); td->td_fpop = fpop; dev_relthread(dev, ref); return(error); } /* * Print out the contents of a special device vnode. 
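 * (It reports the backing cdev by name via devtoname(); see the
 * printf below.)
 */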
*/ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_read(fp, uio, cred, flags, td); return (error); } resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) devfs_timestamp(&dev->si_atime); td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF); return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); if (devfs_populate_vp(ap->a_vp) != 0) { if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & (DE_COVERED | DE_WHITEOUT)) continue; if (devfs_prison_check(dd, uio->uio_td)) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp)); if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; /* NOTE: d_off is the offset for the *next* entry. */ dp->d_off = off + dp->d_reclen; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. 
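 * (This undoes the temporary NULLing performed near the top of
 * devfs_readdir() as part of the cookie workaround.)
 */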
*/ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; mtx_lock(&devfs_de_interlock); de = vp->v_data; if (de != NULL) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); - vnode_destroy_vobject(vp); return (0); } static int devfs_reclaim_vchr(struct vop_reclaim_args *ap) { struct vnode *vp; struct cdev *dev; vp = ap->a_vp; MPASS(vp->v_type == VCHR); devfs_reclaim(ap); VI_LOCK(vp); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; if (dev != NULL) dev->si_usecount -= vp->v_usecount; dev_unlock(); VI_UNLOCK(vp); if (dev != NULL) dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); ASSERT_VOP_ELOCKED(dvp, "devfs_remove"); ASSERT_VOP_ELOCKED(vp, "devfs_remove"); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_dirent->d_type == DT_LNK) { de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) de_covered->de_flags &= ~DE_COVERED; } /* We need to unlock dvp because devfs_delete() may lock it. */ VOP_UNLOCK(vp, 0); if (dvp != vp) VOP_UNLOCK(dvp, 0); devfs_delete(dmp, de, 0); sx_xunlock(&dmp->dm_lock); if (dvp != vp) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { de->de_flags |= DE_WHITEOUT; sx_xunlock(&dmp->dm_lock); } return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. 
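 *
 * Every devfs mount can hold its own alias vnode for the same
 * cdev, so the loop below walks cdp_dirents[] and vgone()s each
 * alias in turn, restarting whenever it had to drop locks; in
 * sketch form (locking and reference dance omitted):
 *
 *	for (i = 0; i <= cdp->cdp_maxdirent; i++)
 *		if ((de = cdp->cdp_dirents[i]) != NULL &&
 *		    (vp2 = de->de_vnode) != NULL)
 *			vgone(vp2);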
* */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; u_int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = cdev2priv(dev); dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp,0); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); VI_LOCK(vp2); mtx_unlock(&devfs_de_interlock); if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct devfs_mount *dmp; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); return (EBADF); } dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); VOP_UNLOCK(vp, 0); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; td = curthread; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } error = devfs_populate_vp(vp); if (error != 0) return (error); de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(td, PRIV_VFS_CHOWN); if (error != 0) goto ret; } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto ret; } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { error = vn_utimes_perm(vp, vap, ap->a_cred, td); if (error != 0) goto ret; if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == 
VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } ret: sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock); return (error); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) { return (vnops.fo_stat(fp, sb, cred, td)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered, *de_dotdot; struct devfs_mount *dmp; error = priv_check(curthread, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOENT); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_flags = DE_USER; de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dir = dd; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) { if ((de_covered->de_flags & DE_USER) != 0) { devfs_delete(dmp, de, DEVFS_DEL_NORECURSE); sx_xunlock(&dmp->dm_lock); return (EEXIST); } KASSERT((de_covered->de_flags & DE_COVERED) == 0, ("devfs_symlink: entry %p already covered", de_covered)); de_covered->de_flags |= DE_COVERED; } de_dotdot = TAILQ_FIRST(&dd->de_dlist); /* "." */ de_dotdot = TAILQ_NEXT(de_dotdot, de_list); /* ".." */ TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list); devfs_dir_ref_de(dmp, dd); devfs_rules_apply(dmp, de); return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_write(fp, uio, cred, flags, td); return (error); } KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { devfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF); return (error); } static int devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct mount *mp; struct vnode *vp; struct file *fpop; vm_object_t object; vm_prot_t maxprot; int error, ref; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. 
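	 * The rules implemented below: a mapping of a file on an
	 * MNT_NOEXEC mount can never gain VM_PROT_EXECUTE; VM_PROT_READ
	 * requires the file to have been opened with FREAD; and, for
	 * MAP_SHARED mappings, VM_PROT_WRITE requires FWRITE.  Asking for
	 * a protection the open mode cannot back fails with EACCES.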
*/ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * The one exception is that D_MMAP_ANON devices * (i.e. /dev/zero) permit private writable mappings. * * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests * as well as updating maxprot to permit writing for * D_MMAP_ANON devices rather than doing that here. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) return (error); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff, &object); td->td_fpop = fpop; dev_relthread(dev, ref); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (cdev2priv(x)->cdp_inode); } static struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = devfs_mmap_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; /* Vops for non-CHR vnodes in /dev. */ static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, .vop_vptocnp = devfs_vptocnp, }; /* Vops for VCHR vnodes in /dev. 
*/ static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = vop_stdfsync, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_ioctl, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_poll = dead_poll, .vop_print = devfs_print, .vop_read = dead_read, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim_vchr, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_vptocnp = devfs_vptocnp, .vop_write = dead_write, }; /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); diff --git a/sys/fs/ext2fs/ext2_inode.c b/sys/fs/ext2fs/ext2_inode.c index 4f0de4b6f5c9..d5e05cbe4747 100644 --- a/sys/fs/ext2fs/ext2_inode.c +++ b/sys/fs/ext2fs/ext2_inode.c @@ -1,644 +1,643 @@ /*- * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_inode.c 8.5 (Berkeley) 12/30/93 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode * to disk if the IN_MODIFIED flag is set (it may be set initially, or by * the timestamp update). The IN_LAZYMOD flag is set to force a write * later if not now. If we write now, then clear both IN_MODIFIED and * IN_LAZYMOD to reflect the presumably successful write, and if waitfor is * set, then wait for the write to complete. */ int ext2_update(struct vnode *vp, int waitfor) { struct m_ext2fs *fs; struct buf *bp; struct inode *ip; int error; ASSERT_VOP_ELOCKED(vp, "ext2_update"); ext2_itimes(vp); ip = VTOI(vp); if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); fs = ip->i_e2fs; if (fs->e2fs_ronly) return (0); if ((error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) { brelse(bp); return (error); } error = ext2_i2ei(ip, (struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number))); if (error) { brelse(bp); return (error); } if (waitfor && !DOINGASYNC(vp)) return (bwrite(bp)); else { bdwrite(bp); return (0); } } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * NB: triple indirect blocks are untested. */ static int ext2_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, int level, e4fs_daddr_t *countp) { struct buf *bp; struct m_ext2fs *fs = ip->i_e2fs; struct vnode *vp; e2fs_daddr_t *bap, *copy; int i, nblocks, error = 0, allerror = 0; e2fs_lbn_t nb, nlbn, last; e4fs_daddr_t blkcount, factor, blocksreleased = 0; /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = 1; for (i = SINGLE; i < level; i++) factor *= NINDIR(fs); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->e2fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. 
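	 * Put differently: bread() would normally translate the logical
	 * block number to a physical address via VOP_BMAP(), but the
	 * indirect chain above this block may already have been freed, so
	 * that translation can no longer work.  Since the physical address
	 * (dbn) is already known, the code below uses getblk(), fills in
	 * b_blkno by hand, and issues the read itself with bstrategy().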
	 */
	vp = ITOV(ip);
	bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0, 0);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
		bp->b_iocmd = BIO_READ;
		if (bp->b_bcount > bp->b_bufsize)
			panic("ext2_indirtrunc: bad buffer size");
		bp->b_blkno = dbn;
		vfs_busy_pages(bp, 0);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
		error = bufwait(bp);
	}
	if (error) {
		brelse(bp);
		*countp = 0;
		return (error);
	}
	bap = (e2fs_daddr_t *)bp->b_data;
	copy = malloc(fs->e2fs_bsize, M_TEMP, M_WAITOK);
	bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->e2fs_bsize);
	bzero((caddr_t)&bap[last + 1],
	    (NINDIR(fs) - (last + 1)) * sizeof(e2fs_daddr_t));
	if (last == -1)
		bp->b_flags |= B_INVAL;
	if (DOINGASYNC(vp)) {
		bdwrite(bp);
	} else {
		error = bwrite(bp);
		if (error)
			allerror = error;
	}
	bap = copy;

	/*
	 * Recursively free totally unused blocks.
	 */
	for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
	    i--, nlbn += factor) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			if ((error = ext2_indirtrunc(ip, nlbn,
			    fsbtodb(fs, nb), (int32_t)-1, level - 1,
			    &blkcount)) != 0)
				allerror = error;
			blocksreleased += blkcount;
		}
		ext2_blkfree(ip, nb, fs->e2fs_bsize);
		blocksreleased += nblocks;
	}

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0) {
			if ((error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
			    last, level - 1, &blkcount)) != 0)
				allerror = error;
			blocksreleased += blkcount;
		}
	}
	free(copy, M_TEMP);
	*countp = blocksreleased;
	return (allerror);
}

/*
 * Truncate the inode oip to at most length size, freeing the
 * disk blocks.
 */
static int
ext2_ind_truncate(struct vnode *vp, off_t length, int flags,
    struct ucred *cred, struct thread *td)
{
	struct vnode *ovp = vp;
	e4fs_daddr_t lastblock;
	struct inode *oip;
	e4fs_daddr_t bn, lbn, lastiblock[EXT2_NIADDR], indir_lbn[EXT2_NIADDR];
	uint32_t oldblks[EXT2_NDADDR + EXT2_NIADDR];
	uint32_t newblks[EXT2_NDADDR + EXT2_NIADDR];
	struct m_ext2fs *fs;
	struct buf *bp;
	int offset, size, level;
	e4fs_daddr_t count, nblocks, blocksreleased = 0;
	int error, i, allerror;
	off_t osize;
#ifdef INVARIANTS
	struct bufobj *bo;
#endif

	oip = VTOI(ovp);
#ifdef INVARIANTS
	bo = &ovp->v_bufobj;
#endif

	fs = oip->i_e2fs;
	osize = oip->i_size;
	/*
	 * Lengthen the size of the file. We must ensure that the
	 * last byte of the file is allocated. Since the smallest
	 * value of osize is 0, length will be at least 1.
	 */
	if (osize < length) {
		if (length > oip->i_e2fs->e2fs_maxfilesize)
			return (EFBIG);
		vnode_pager_setsize(ovp, length);
		offset = blkoff(fs, length - 1);
		lbn = lblkno(fs, length - 1);
		flags |= BA_CLRBUF;
		error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags);
		if (error) {
			vnode_pager_setsize(vp, osize);
			return (error);
		}
		oip->i_size = length;
		if (bp->b_bufsize == fs->e2fs_bsize)
			bp->b_flags |= B_CLUSTEROK;
		if (flags & IO_SYNC)
			bwrite(bp);
		else if (DOINGASYNC(ovp))
			bdwrite(bp);
		else
			bawrite(bp);
		oip->i_flag |= IN_CHANGE | IN_UPDATE;
		return (ext2_update(ovp, !DOINGASYNC(ovp)));
	}
	/*
	 * Shorten the size of the file. If the file is not being
	 * truncated to a block boundary, the contents of the
	 * partial block following the end of the file must be
	 * zero'ed in case it ever becomes accessible again because
	 * of subsequent file growth.
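	 * Worked example, assuming a 4096-byte block size: truncating to
	 * length 4100 leaves offset = blkoff(fs, 4100) = 4 valid bytes in
	 * the last block, so bytes 4..4095 of that block are bzero()ed
	 * below.  If the file later grows back over that range, readers
	 * must see zeros there rather than stale data.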
*/ /* I don't understand the comment above */ offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset, cred, &bp, flags); if (error) return (error); oip->i_size = length; size = blksize(fs, oip, lbn); bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->e2fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - EXT2_NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->e2fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ext2_indirtrunc below. */ for (level = TRIPLE; level >= SINGLE; level--) { oldblks[EXT2_NDADDR + level] = oip->i_ib[level]; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; lastiblock[level] = -1; } } for (i = 0; i < EXT2_NDADDR; i++) { oldblks[i] = oip->i_db[i]; if (i > lastblock) oip->i_db[i] = 0; } oip->i_flag |= IN_CHANGE | IN_UPDATE; allerror = ext2_update(ovp, !DOINGASYNC(ovp)); /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ for (i = 0; i < EXT2_NDADDR; i++) { newblks[i] = oip->i_db[i]; oip->i_db[i] = oldblks[i]; } for (i = 0; i < EXT2_NIADDR; i++) { newblks[EXT2_NDADDR + i] = oip->i_ib[i]; oip->i_ib[i] = oldblks[EXT2_NDADDR + i]; } oip->i_size = osize; error = vtruncbuf(ovp, length, (int)fs->e2fs_bsize); if (error && (allerror == 0)) allerror = error; vnode_pager_setsize(ovp, length); /* * Indirect blocks first. */ indir_lbn[SINGLE] = -EXT2_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = oip->i_ib[level]; if (bn != 0) { error = ext2_indirtrunc(oip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; ext2_blkfree(oip, bn, fs->e2fs_fsize); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ for (i = EXT2_NDADDR - 1; i > lastblock; i--) { long bsize; bn = oip->i_db[i]; if (bn == 0) continue; oip->i_db[i] = 0; bsize = blksize(fs, oip, i); ext2_blkfree(oip, bn, bsize); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = oip->i_db[lastblock]; if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. 
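		 * Worked example with purely illustrative numbers: if the
		 * old last block occupied 8 fragments and the new length
		 * only needs 3, then numfrags(fs, newspace) == 3, the freed
		 * range starts at bn + 3, and oldspace - newspace (5
		 * fragments' worth of bytes) is handed to ext2_blkfree().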
*/ oldspace = blksize(fs, oip, lastblock); oip->i_size = length; newspace = blksize(fs, oip, lastblock); if (newspace == 0) panic("ext2_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. */ bn += numfrags(fs, newspace); ext2_blkfree(oip, bn, oldspace - newspace); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef INVARIANTS for (level = SINGLE; level <= TRIPLE; level++) if (newblks[EXT2_NDADDR + level] != oip->i_ib[level]) panic("itrunc1"); for (i = 0; i < EXT2_NDADDR; i++) if (newblks[i] != oip->i_db[i]) panic("itrunc2"); BO_LOCK(bo); if (length == 0 && (bo->bo_dirty.bv_cnt != 0 || bo->bo_clean.bv_cnt != 0)) panic("itrunc3"); BO_UNLOCK(bo); #endif /* INVARIANTS */ /* * Put back the real size. */ oip->i_size = length; if (oip->i_blocks >= blocksreleased) oip->i_blocks -= blocksreleased; else /* sanity */ oip->i_blocks = 0; oip->i_flag |= IN_CHANGE; vnode_pager_setsize(ovp, length); return (allerror); } static int ext2_ext_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred, struct thread *td) { struct vnode *ovp = vp; int32_t lastblock; struct m_ext2fs *fs; struct inode *oip; struct buf *bp; uint32_t lbn, offset; int error, size; off_t osize; oip = VTOI(ovp); fs = oip->i_e2fs; osize = oip->i_size; if (osize < length) { if (length > oip->i_e2fs->e2fs_maxfilesize) { return (EFBIG); } vnode_pager_setsize(ovp, length); offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags); if (error) { vnode_pager_setsize(vp, osize); return (error); } oip->i_size = length; if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(ovp, !DOINGASYNC(ovp))); } lastblock = (length + fs->e2fs_bsize - 1) / fs->e2fs_bsize; error = ext4_ext_remove_space(oip, lastblock, flags, cred, td); if (error) return (error); offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset, cred, &bp, flags); if (error) { return (error); } oip->i_size = length; size = blksize(fs, oip, lbn); bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); } oip->i_size = osize; error = vtruncbuf(ovp, length, (int)fs->e2fs_bsize); if (error) return (error); vnode_pager_setsize(ovp, length); oip->i_size = length; oip->i_flag |= IN_CHANGE | IN_UPDATE; error = ext2_update(ovp, !DOINGASYNC(ovp)); return (error); } /* * Truncate the inode ip to at most length size, freeing the * disk blocks. 
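 * This is the entry point: after the fast paths (symlinks stored
 * inline in the inode, and truncation to the current size) it
 * dispatches to ext2_ext_truncate() for extent-mapped inodes
 * (IN_E4EXTENTS) and to ext2_ind_truncate() for classic block-mapped
 * ones.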
 */
int
ext2_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred,
    struct thread *td)
{
	struct inode *ip;
	int error;

	ASSERT_VOP_LOCKED(vp, "ext2_truncate");

	if (length < 0)
		return (EINVAL);

	ip = VTOI(vp);
	if (vp->v_type == VLNK &&
	    ip->i_size < vp->v_mount->mnt_maxsymlinklen) {
#ifdef INVARIANTS
		if (length != 0)
			panic("ext2_truncate: partial truncate of symlink");
#endif
		bzero((char *)&ip->i_shortlink, (u_int)ip->i_size);
		ip->i_size = 0;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		return (ext2_update(vp, 1));
	}
	if (ip->i_size == length) {
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		return (ext2_update(vp, 0));
	}

	if (ip->i_flag & IN_E4EXTENTS)
		error = ext2_ext_truncate(vp, length, flags, cred, td);
	else
		error = ext2_ind_truncate(vp, length, flags, cred, td);

	return (error);
}

/*
 * discard preallocated blocks
 */
int
ext2_inactive(struct vop_inactive_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct thread *td = ap->a_td;
	int mode, error = 0;

	/*
	 * Ignore inodes related to stale file handles.
	 */
	if (ip->i_mode == 0)
		goto out;
	if (ip->i_nlink <= 0) {
		ext2_extattr_free(ip);
		error = ext2_truncate(vp, (off_t)0, 0, NOCRED, td);
		if (!(ip->i_flag & IN_E4EXTENTS))
			ip->i_rdev = 0;
		mode = ip->i_mode;
		ip->i_mode = 0;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		ext2_vfree(vp, ip->i_number, mode);
	}
	if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE))
		ext2_update(vp, 0);
out:
	/*
	 * If we are done with the inode, reclaim it
	 * so that it can be reused immediately.
	 */
	if (ip->i_mode == 0)
		vrecycle(vp);
	return (error);
}

/*
 * Reclaim an inode so that it can be used for other purposes.
 */
int
ext2_reclaim(struct vop_reclaim_args *ap)
{
	struct inode *ip;
	struct vnode *vp = ap->a_vp;

	ip = VTOI(vp);
	if (ip->i_flag & IN_LAZYMOD) {
		ip->i_flag |= IN_MODIFIED;
		ext2_update(vp, 0);
	}
	vfs_hash_remove(vp);
	free(vp->v_data, M_EXT2NODE);
	vp->v_data = 0;
-	vnode_destroy_vobject(vp);
	return (0);
}
diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c
index 537f1b734c29..c886527b480a 100644
--- a/sys/fs/fuse/fuse_vnops.c
+++ b/sys/fs/fuse/fuse_vnops.c
@@ -1,2494 +1,2493 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2007-2009 Google Inc. and Amit Singh
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 * * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_io.h" #include /* Maximum number of hardlinks to a single FUSE file */ #define FUSE_LINK_MAX UINT32_MAX SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*"); /* vnode ops */ static vop_access_t fuse_vnop_access; static vop_advlock_t fuse_vnop_advlock; static vop_bmap_t fuse_vnop_bmap; static vop_close_t fuse_fifo_close; static vop_close_t fuse_vnop_close; static vop_create_t fuse_vnop_create; static vop_deleteextattr_t fuse_vnop_deleteextattr; static vop_fdatasync_t fuse_vnop_fdatasync; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_getextattr_t fuse_vnop_getextattr; static vop_inactive_t fuse_vnop_inactive; static vop_link_t fuse_vnop_link; static vop_listextattr_t fuse_vnop_listextattr; static vop_lookup_t fuse_vnop_lookup; static vop_mkdir_t fuse_vnop_mkdir; static vop_mknod_t fuse_vnop_mknod; static vop_open_t fuse_vnop_open; static vop_pathconf_t fuse_vnop_pathconf; static vop_read_t fuse_vnop_read; static vop_readdir_t fuse_vnop_readdir; static vop_readlink_t fuse_vnop_readlink; static vop_reclaim_t fuse_vnop_reclaim; static vop_remove_t fuse_vnop_remove; static vop_rename_t fuse_vnop_rename; static vop_rmdir_t fuse_vnop_rmdir; static vop_setattr_t fuse_vnop_setattr; static vop_setextattr_t fuse_vnop_setextattr; static vop_strategy_t fuse_vnop_strategy; static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; static vop_print_t fuse_vnop_print; static vop_vptofh_t fuse_vnop_vptofh; struct vop_vector fuse_fifoops = { .vop_default = &fifo_specops, .vop_access = fuse_vnop_access, .vop_close = fuse_fifo_close, .vop_fsync = fuse_vnop_fsync, .vop_getattr = fuse_vnop_getattr, .vop_inactive = fuse_vnop_inactive, .vop_pathconf = fuse_vnop_pathconf, .vop_print = fuse_vnop_print, .vop_read = VOP_PANIC, .vop_reclaim = fuse_vnop_reclaim, .vop_setattr = fuse_vnop_setattr, .vop_write = VOP_PANIC, .vop_vptofh = fuse_vnop_vptofh, }; struct vop_vector fuse_vnops = { .vop_allocate = VOP_EINVAL, .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, .vop_advlock = fuse_vnop_advlock, .vop_bmap = fuse_vnop_bmap, .vop_close = fuse_vnop_close, .vop_create = fuse_vnop_create, .vop_deleteextattr = fuse_vnop_deleteextattr, .vop_fsync = fuse_vnop_fsync, .vop_fdatasync = fuse_vnop_fdatasync, .vop_getattr = fuse_vnop_getattr, .vop_getextattr = fuse_vnop_getextattr, .vop_inactive = fuse_vnop_inactive, /* * TODO: implement vop_ioctl after upgrading to protocol 7.16. * FUSE_IOCTL was added in 7.11, but 32-bit compat is broken until * 7.16. */ .vop_link = fuse_vnop_link, .vop_listextattr = fuse_vnop_listextattr, .vop_lookup = fuse_vnop_lookup, .vop_mkdir = fuse_vnop_mkdir, .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = fuse_vnop_pathconf, /* * TODO: implement vop_poll after upgrading to protocol 7.21. 
* FUSE_POLL was added in protocol 7.11, but it's kind of broken until * 7.21, which adds the ability for the client to choose which poll * events it wants, and for a client to deregister a file handle */ .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, .vop_reclaim = fuse_vnop_reclaim, .vop_remove = fuse_vnop_remove, .vop_rename = fuse_vnop_rename, .vop_rmdir = fuse_vnop_rmdir, .vop_setattr = fuse_vnop_setattr, .vop_setextattr = fuse_vnop_setextattr, .vop_strategy = fuse_vnop_strategy, .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, .vop_print = fuse_vnop_print, .vop_vptofh = fuse_vnop_vptofh, }; uma_zone_t fuse_pbuf_zone; #define fuse_vm_page_lock(m) vm_page_lock((m)); #define fuse_vm_page_unlock(m) vm_page_unlock((m)); #define fuse_vm_page_lock_queues() ((void)0) #define fuse_vm_page_unlock_queues() ((void)0) /* Check permission for extattr operations, much like extattr_check_cred */ static int fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred, struct thread *td, accmode_t accmode) { struct mount *mp = vnode_mount(vp); struct fuse_data *data = fuse_get_mpdata(mp); /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (ns) { case EXTATTR_NAMESPACE_SYSTEM: if (data->dataflags & FSESS_DEFAULT_PERMISSIONS) { return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); } /* FALLTHROUGH */ case EXTATTR_NAMESPACE_USER: return (fuse_internal_access(vp, accmode, td, cred)); default: return (EPERM); } } /* Get a filehandle for a directory */ static int fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) { if (fuse_filehandle_get(vp, FREAD, fufhp, cred, pid) == 0) return 0; return fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid); } /* Send FUSE_FLUSH for this vnode */ static int fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) { struct fuse_flush_in *ffi; struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct thread *td = curthread; struct mount *mp = vnode_mount(vp); int err; if (!fsess_isimpl(vnode_mount(vp), FUSE_FLUSH)) return 0; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (err) return err; fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred); ffi = fdi.indata; ffi->fh = fufh->fh_id; /* * If the file has a POSIX lock then we're supposed to set lock_owner. * If not, then lock_owner is undefined. So we may as well always set * it. */ ffi->lock_owner = td->td_proc->p_pid; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FLUSH); err = 0; } fdisp_destroy(&fdi); return err; } /* Close wrapper for fifos. 
*/ static int fuse_fifo_close(struct vop_close_args *ap) { return (fifo_specops.vop_close(ap)); } /* struct vnop_access_args { struct vnode *a_vp; #if VOP_ACCESS_TAKES_ACCMODE_T accmode_t a_accmode; #else int a_mode; #endif struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; if (fuse_isdeadfs(vp)) { if (vnode_isvroot(vp)) { return 0; } return ENXIO; } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { if (priv_check_cred(cred, PRIV_VFS_ADMIN) || (fuse_match_cred(data->daemoncred, cred) == 0)) { return 0; } } return EBADF; } if (vnode_islnk(vp)) { return 0; } err = fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred); return err; } /* * struct vop_advlock_args { * struct vop_generic_args a_gen; * struct vnode *a_vp; * void *a_id; * int a_op; * struct flock *a_fl; * int a_flags; * } */ static int fuse_vnop_advlock(struct vop_advlock_args *ap) { struct vnode *vp = ap->a_vp; struct flock *fl = ap->a_fl; struct thread *td = curthread; struct ucred *cred = td->td_ucred; pid_t pid = td->td_proc->p_pid; struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct fuse_lk_in *fli; struct fuse_lk_out *flo; enum fuse_opcode op; int dataflags, err; int flags = ap->a_flags; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; if (fuse_isdeadfs(vp)) { return ENXIO; } if (!(dataflags & FSESS_POSIX_LOCKS)) return vop_stdadvlock(ap); /* FUSE doesn't properly support flock until protocol 7.17 */ if (flags & F_FLOCK) return vop_stdadvlock(ap); err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid); if (err) return err; fdisp_init(&fdi, sizeof(*fli)); switch(ap->a_op) { case F_GETLK: op = FUSE_GETLK; break; case F_SETLK: op = FUSE_SETLK; break; case F_SETLKW: op = FUSE_SETLKW; break; default: return EINVAL; } fdisp_make_vp(&fdi, op, vp, td, cred); fli = fdi.indata; fli->fh = fufh->fh_id; fli->owner = fl->l_pid; fli->lk.start = fl->l_start; if (fl->l_len != 0) fli->lk.end = fl->l_start + fl->l_len - 1; else fli->lk.end = INT64_MAX; fli->lk.type = fl->l_type; fli->lk.pid = fl->l_pid; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); if (err == 0 && op == FUSE_GETLK) { flo = fdi.answ; fl->l_type = flo->lk.type; fl->l_pid = flo->lk.pid; if (flo->lk.type != F_UNLCK) { fl->l_start = flo->lk.start; if (flo->lk.end == INT64_MAX) fl->l_len = 0; else fl->l_len = flo->lk.end - flo->lk.start + 1; fl->l_start = flo->lk.start; } } return err; } /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ static int fuse_vnop_bmap(struct vop_bmap_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj **bo = ap->a_bop; struct thread *td = curthread; struct mount *mp; struct fuse_dispatcher fdi; struct fuse_bmap_in *fbi; struct fuse_bmap_out *fbo; struct fuse_data *data; uint64_t biosize; off_t filesize; daddr_t lbn = ap->a_bn; daddr_t *pbn = ap->a_bnp; int *runp = ap->a_runp; int *runb = ap->a_runb; int error = 0; int maxrun; if (fuse_isdeadfs(vp)) { return ENXIO; } mp = vnode_mount(vp); data = fuse_get_mpdata(mp); biosize = fuse_iosize(vp); maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1, data->max_readahead_blocks); if (bo != NULL) *bo = &vp->v_bufobj; /* * The FUSE_BMAP operation does not include the runp and runb * variables, so we must guess. Report nonzero contiguous runs so * cluster_read will combine adjacent reads. 
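	 * Concretely, with illustrative numbers: for biosize 64k, a 1 MB
	 * file and lbn 4, the code below reports up to MIN(4, maxrun)
	 * blocks behind the target and MIN(1M/64k - 4 - 1, maxrun) =
	 * MIN(11, maxrun) blocks ahead of it.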
It's worthwhile to reduce * upcalls even if we don't know the true physical layout of the file. * * FUSE file systems may opt out of read clustering in two ways: * * mounting with -onoclusterr * * Setting max_readahead <= maxbcachebuf during FUSE_INIT */ if (runb != NULL) *runb = MIN(lbn, maxrun); if (runp != NULL) { error = fuse_vnode_size(vp, &filesize, td->td_ucred, td); if (error == 0) *runp = MIN(MAX(0, filesize / biosize - lbn - 1), maxrun); else *runp = 0; } if (fsess_isimpl(mp, FUSE_BMAP)) { fdisp_init(&fdi, sizeof(*fbi)); fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred); fbi = fdi.indata; fbi->block = lbn; fbi->blocksize = biosize; error = fdisp_wait_answ(&fdi); if (error == ENOSYS) { fdisp_destroy(&fdi); fsess_set_notimpl(mp, FUSE_BMAP); error = 0; } else { fbo = fdi.answ; if (error == 0 && pbn != NULL) *pbn = fbo->block; fdisp_destroy(&fdi); return error; } } /* If the daemon doesn't support BMAP, make up a sensible default */ if (pbn != NULL) *pbn = lbn * btodb(biosize); return (error); } /* struct vop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; struct thread *td = ap->a_td; pid_t pid = td->td_proc->p_pid; int err = 0; if (fuse_isdeadfs(vp)) return 0; if (vnode_isdir(vp)) return 0; if (fflag & IO_NDELAY) return 0; err = fuse_flush(vp, cred, pid, fflag); /* TODO: close the file handle, if we're sure it's no longer used */ if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, cred, td->td_proc->p_pid); } return err; } static void fdisp_make_mknod_for_fallback( struct fuse_dispatcher *fdip, struct componentname *cnp, struct vnode *dvp, uint64_t parentnid, struct thread *td, struct ucred *cred, mode_t mode, enum fuse_opcode *op) { struct fuse_mknod_in *fmni; fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1); *op = FUSE_MKNOD; fdisp_make(fdip, *op, vnode_mount(dvp), parentnid, td, cred); fmni = fdip->indata; fmni->mode = mode; fmni->rdev = 0; memcpy((char *)fdip->indata + sizeof(*fmni), cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[sizeof(*fmni) + cnp->cn_namelen] = '\0'; } /* struct vnop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; struct fuse_data *data; struct fuse_create_in *fci; struct fuse_entry_out *feo; struct fuse_open_out *foo; struct fuse_dispatcher fdi, fdi2; struct fuse_dispatcher *fdip = &fdi; struct fuse_dispatcher *fdip2 = NULL; int err; struct mount *mp = vnode_mount(dvp); data = fuse_get_mpdata(mp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); enum fuse_opcode op; int flags; if (fuse_isdeadfs(dvp)) return ENXIO; /* FUSE expects sockets to be created with FUSE_MKNOD */ if (vap->va_type == VSOCK) return fuse_internal_mknod(dvp, vpp, cnp, vap); /* * VOP_CREATE doesn't tell us the open(2) flags, so we guess. Only a * writable mode makes sense, and we might as well include readability * too. 
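	 * The guess is therefore O_RDWR: it is passed as O_CREAT | flags
	 * in fuse_create_in, reused for the fallback FUSE_OPEN, and also
	 * for FUSE_RELEASE on the error path, so the daemon sees one
	 * consistent open mode throughout.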
*/ flags = O_RDWR; bzero(&fdi, sizeof(fdi)); if (vap->va_type != VREG) return (EINVAL); if (!fsess_isimpl(mp, FUSE_CREATE) || vap->va_type == VSOCK) { /* Fallback to FUSE_MKNOD/FUSE_OPEN */ fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, cred, mode, &op); } else { /* Use FUSE_CREATE */ size_t insize; op = FUSE_CREATE; fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1); fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred); fci = fdip->indata; fci->mode = mode; fci->flags = O_CREAT | flags; if (fuse_libabi_geq(data, 7, 12)) { insize = sizeof(*fci); fci->umask = td->td_proc->p_fd->fd_cmask; } else { insize = sizeof(struct fuse_open_in); } memcpy((char *)fdip->indata + insize, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0'; } err = fdisp_wait_answ(fdip); if (err) { if (err == ENOSYS && op == FUSE_CREATE) { fsess_set_notimpl(mp, FUSE_CREATE); fdisp_destroy(fdip); fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, cred, mode, &op); err = fdisp_wait_answ(fdip); } if (err) goto out; } feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, vap->va_type))) { goto out; } if (op == FUSE_CREATE) { foo = (struct fuse_open_out*)(feo + 1); } else { /* Issue a separate FUSE_OPEN */ struct fuse_open_in *foi; fdip2 = &fdi2; fdisp_init(fdip2, sizeof(*foi)); fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td, cred); foi = fdip2->indata; foi->flags = flags; err = fdisp_wait_answ(fdip2); if (err) goto out; foo = fdip2->answ; } err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; uint64_t fh_id = foo->fh; fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; fri->flags = flags; fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); fuse_insert_message(fdip->tick, false); goto out; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo); fuse_vnode_open(*vpp, foo->open_flags, td); /* * Purge the parent's attribute cache because the daemon should've * updated its mtime and ctime */ fuse_vnode_clear_attr_cache(dvp); cache_purge_negative(dvp); out: if (fdip2) fdisp_destroy(fdip2); fdisp_destroy(fdip); return err; } /* struct vnop_fdatasync_args { struct vop_generic_args a_gen; struct vnode * a_vp; struct thread * a_td; }; */ static int fuse_vnop_fdatasync(struct vop_fdatasync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; int waitfor = MNT_WAIT; int err = 0; if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfdatasync_buf(ap))) return err; return fuse_internal_fsync(vp, td, waitfor, true); } /* struct vnop_fsync_args { struct vop_generic_args a_gen; struct vnode * a_vp; int a_waitfor; struct thread * a_td; }; */ static int fuse_vnop_fsync(struct vop_fsync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; int waitfor = ap->a_waitfor; int err = 0; if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; return fuse_internal_fsync(vp, td, waitfor, false); } /* struct vnop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; 
int err = 0; int dataflags; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; /* Note that we are not bailing out on a dead file system just yet. */ if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); err = ENOTCONN; return err; } else { goto fake; } } err = fuse_internal_getattr(vp, vap, cred, td); if (err == ENOTCONN && vnode_isvroot(vp)) { /* see comment in fuse_vfsop_statfs() */ goto fake; } else { return err; } fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); return 0; } /* struct vnop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh, *fufh_tmp; int need_flush = 1; LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { if (need_flush && vp->v_type == VREG) { if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, NULL, 0); } if ((fvdat->flag & FN_REVOKED) != 0) fuse_io_invalbuf(vp, td); else fuse_io_flushbuf(vp, MNT_WAIT, td); need_flush = 0; } fuse_filehandle_close(vp, fufh, td, NULL); } if ((fvdat->flag & FN_REVOKED) != 0) vrecycle(vp); return 0; } /* struct vnop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = VTOVA(vp); struct fuse_dispatcher fdi; struct fuse_entry_out *feo; struct fuse_link_in fli; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_mount(tdvp) != vnode_mount(vp)) { return EXDEV; } /* * This is a seatbelt check to protect naive userspace filesystems from * themselves and the limitations of the FUSE IPC protocol. If a * filesystem does not allow attribute caching, assume it is capable of * validating that nlink does not overflow. 
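	 * (FUSE_LINK_MAX is UINT32_MAX because struct fuse_attr carries
	 * nlink as a 32-bit field; the check below can only fire when a
	 * cached vattr is available to consult.)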
*/ if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX) return EMLINK; fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp, FUSE_LINK, &fli, sizeof(fli), &fdi); if ((err = fdisp_wait_answ(&fdi))) { goto out; } feo = fdi.answ; err = fuse_internal_checkentry(feo, vnode_vtype(vp)); if (!err) { /* * Purge the parent's attribute cache because the daemon * should've updated its mtime and ctime */ fuse_vnode_clear_attr_cache(tdvp); fuse_internal_cache_attrs(vp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); } out: fdisp_destroy(&fdi); return err; } struct fuse_lookup_alloc_arg { struct fuse_entry_out *feo; struct componentname *cnp; uint64_t nid; enum vtype vtyp; }; /* Callback for vn_get_ino */ static int fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { struct fuse_lookup_alloc_arg *flaa = arg; return fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp, flaa->vtyp); } SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup, "int", "struct timespec*", "struct timespec*"); /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; */ int fuse_vnop_lookup(struct vop_lookup_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; int nameiop = cnp->cn_nameiop; int flags = cnp->cn_flags; int wantparent = flags & (LOCKPARENT | WANTPARENT); int islastcn = flags & ISLASTCN; struct mount *mp = vnode_mount(dvp); int err = 0; int lookup_err = 0; struct vnode *vp = NULL; struct fuse_dispatcher fdi; bool did_lookup = false; struct fuse_entry_out *feo = NULL; enum vtype vtyp; /* vnode type of target */ off_t filesize; /* filesize of target */ uint64_t nid; if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } if (!vnode_isdir(dvp)) return ENOTDIR; if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) return EROFS; if ((err = fuse_internal_access(dvp, VEXEC, td, cred))) return err; if (flags & ISDOTDOT) { KASSERT(VTOFUD(dvp)->flag & FN_PARENT_NID, ("Looking up .. is TODO")); nid = VTOFUD(dvp)->parent_nid; if (nid == 0) return ENOENT; /* .. is obviously a directory */ vtyp = VDIR; filesize = 0; } else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') { nid = VTOI(dvp); /* . 
is obviously a directory */ vtyp = VDIR; filesize = 0; } else { struct timespec now, timeout; err = cache_lookup(dvp, vpp, cnp, &timeout, NULL); getnanouptime(&now); SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now); switch (err) { case -1: /* positive match */ if (timespeccmp(&timeout, &now, >)) { counter_u64_add(fuse_lookup_cache_hits, 1); } else { /* Cache timeout */ counter_u64_add(fuse_lookup_cache_misses, 1); bintime_clear( &VTOFUD(*vpp)->entry_cache_timeout); cache_purge(*vpp); if (dvp != *vpp) vput(*vpp); else vrele(*vpp); *vpp = NULL; break; } return 0; case 0: /* no match in cache */ counter_u64_add(fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ getnanouptime(&now); if (timespeccmp(&timeout, &now, <=)) { /* Cache timeout */ cache_purge_negative(dvp); break; } /* fall through */ default: return err; } nid = VTOI(dvp); fdisp_init(&fdi, cnp->cn_namelen + 1); fdisp_make(&fdi, FUSE_LOOKUP, mp, nid, td, cred); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; lookup_err = fdisp_wait_answ(&fdi); did_lookup = true; if (!lookup_err) { /* lookup call succeeded */ feo = (struct fuse_entry_out *)fdi.answ; nid = feo->nodeid; if (nid == 0) { /* zero nodeid means ENOENT and cache it */ struct timespec timeout; fdi.answ_stat = ENOENT; lookup_err = ENOENT; if (cnp->cn_flags & MAKEENTRY) { fuse_validity_2_timespec(feo, &timeout); cache_enter_time(dvp, *vpp, cnp, &timeout, NULL); } } else if (nid == FUSE_ROOT_ID) { lookup_err = EINVAL; } vtyp = IFTOVT(feo->attr.mode); filesize = feo->attr.size; } if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) { fdisp_destroy(&fdi); return lookup_err; } } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { /* Entry not found */ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { err = fuse_internal_access(dvp, VWRITE, td, cred); if (!err) { /* * Set the SAVENAME flag to hold onto the * pathname for use later in VOP_CREATE or * VOP_RENAME. */ cnp->cn_flags |= SAVENAME; err = EJUSTRETURN; } } else { err = ENOENT; } } else { /* Entry was found */ if (flags & ISDOTDOT) { struct fuse_lookup_alloc_arg flaa; flaa.nid = nid; flaa.feo = feo; flaa.cnp = cnp; flaa.vtyp = vtyp; err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0, &vp); *vpp = vp; } else if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { struct fuse_vnode_data *fvdat; err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, &vp, cnp, vtyp); if (err) goto out; *vpp = vp; /* * In the case where we are looking up a FUSE node * represented by an existing cached vnode, and the * true size reported by FUSE_LOOKUP doesn't match * the vnode's cached size, then any cached writes * beyond the file's current size are lost. * * We can get here: * * following attribute cache expiration, or * * due a bug in the daemon, or */ fvdat = VTOFUD(vp); if (vnode_isreg(vp) && filesize != fvdat->cached_attrs.va_size && fvdat->flag & FN_SIZECHANGE) { /* * The FN_SIZECHANGE flag reflects a dirty * append. If userspace lets us know our cache * is invalid, that write was lost. (Dirty * writes that do not cause append are also * lost, but we don't detect them here.) * * XXX: Maybe disable WB caching on this mount. 
*/ printf("%s: WB cache incoherent on %s!\n", __func__, vnode_mount(vp)->mnt_stat.f_mntonname); fvdat->flag &= ~FN_SIZECHANGE; } MPASS(feo != NULL); fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec, &fvdat->entry_cache_timeout); if ((nameiop == DELETE || nameiop == RENAME) && islastcn) { struct vattr dvattr; err = fuse_internal_access(dvp, VWRITE, td, cred); if (err != 0) goto out; /* * if the parent's sticky bit is set, check * whether we're allowed to remove the file. * Need to figure out the vnode locking to make * this work. */ fuse_internal_getattr(dvp, &dvattr, cred, td); if ((dvattr.va_mode & S_ISTXT) && fuse_internal_access(dvp, VADMIN, td, cred) && fuse_internal_access(*vpp, VADMIN, td, cred)) { err = EPERM; goto out; } } if (islastcn && ( (nameiop == DELETE) || (nameiop == RENAME && wantparent))) { cnp->cn_flags |= SAVENAME; } } } out: if (err) { if (vp != NULL && dvp != vp) vput(vp); else if (vp != NULL) vrele(vp); *vpp = NULL; } if (did_lookup) fdisp_destroy(&fdi); return err; } /* struct vnop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct fuse_mkdir_in fmdi; if (fuse_isdeadfs(dvp)) { return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); fmdi.umask = curthread->td_proc->p_fd->fd_cmask; return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); } /* struct vnop_mknod_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mknod(struct vop_mknod_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; if (fuse_isdeadfs(dvp)) return ENXIO; return fuse_internal_mknod(dvp, vpp, cnp, vap); } /* struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; / struct file *a_fp; }; */ static int fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; int a_mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; pid_t pid = td->td_proc->p_pid; struct fuse_vnode_data *fvdat; if (fuse_isdeadfs(vp)) return ENXIO; if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO) return (EOPNOTSUPP); if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0) return EINVAL; fvdat = VTOFUD(vp); if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) { fuse_vnode_open(vp, 0, td); return 0; } return fuse_filehandle_open(vp, a_mode, NULL, td, cred); } static int fuse_vnop_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = MIN(LONG_MAX, FUSE_LINK_MAX); return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } } /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = 
/*
    struct vnop_read_args {
	struct vnode *a_vp;
	struct uio *a_uio;
	int a_ioflag;
	struct ucred *a_cred;
    };
*/
static int
fuse_vnop_read(struct vop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	int ioflag = ap->a_ioflag;
	struct ucred *cred = ap->a_cred;
	pid_t pid = curthread->td_proc->p_pid;

	if (fuse_isdeadfs(vp)) {
		return ENXIO;
	}

	if (VTOFUD(vp)->flag & FN_DIRECTIO) {
		ioflag |= IO_DIRECT;
	}

	return fuse_io_dispatch(vp, uio, ioflag, cred, pid);
}

/*
    struct vnop_readdir_args {
	struct vnode *a_vp;
	struct uio *a_uio;
	struct ucred *a_cred;
	int *a_eofflag;
	int *a_ncookies;
	u_long **a_cookies;
    };
*/
static int
fuse_vnop_readdir(struct vop_readdir_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct ucred *cred = ap->a_cred;
	struct fuse_filehandle *fufh = NULL;
	struct fuse_iov cookediov;
	int err = 0;
	u_long *cookies;
	off_t startoff;
	ssize_t tresid;
	int ncookies;
	bool closefufh = false;
	pid_t pid = curthread->td_proc->p_pid;

	if (ap->a_eofflag)
		*ap->a_eofflag = 0;
	if (fuse_isdeadfs(vp)) {
		return ENXIO;
	}
	if (				/* XXXIP ((uio_iovcnt(uio) > 1)) || */
	    (uio_resid(uio) < sizeof(struct dirent))) {
		return EINVAL;
	}

	tresid = uio->uio_resid;
	startoff = uio->uio_offset;
	err = fuse_filehandle_get_dir(vp, &fufh, cred, pid);
	if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
		/*
		 * nfsd will do VOP_READDIR without first doing VOP_OPEN.  We
		 * must implicitly open the directory here.
		 */
		err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred);
		if (err == 0) {
			/*
			 * When a directory is opened, it must be read from
			 * the beginning.  Hopefully, the "startoff" still
			 * exists as an offset cookie for the directory.
			 * If not, it will read the entire directory without
			 * returning any entries and just return eof.
			 */
			uio->uio_offset = 0;
		}
		closefufh = true;
	}
	if (err)
		return (err);
	if (ap->a_ncookies != NULL) {
		ncookies = uio->uio_resid /
			(offsetof(struct dirent, d_name) + 4) + 1;
		cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK);
		*ap->a_ncookies = ncookies;
		*ap->a_cookies = cookies;
	} else {
		ncookies = 0;
		cookies = NULL;
	}
#define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1)
	fiov_init(&cookediov, DIRCOOKEDSIZE);

	err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov,
		&ncookies, cookies);

	fiov_teardown(&cookediov);
	if (closefufh)
		fuse_filehandle_close(vp, fufh, curthread, cred);

	if (ap->a_ncookies != NULL) {
		if (err == 0) {
			*ap->a_ncookies -= ncookies;
		} else {
			free(*ap->a_cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	}
	if (err == 0 && tresid == uio->uio_resid)
		*ap->a_eofflag = 1;

	return err;
}

/*
    struct vnop_readlink_args {
	struct vnode *a_vp;
	struct uio *a_uio;
	struct ucred *a_cred;
    };
*/
static int
fuse_vnop_readlink(struct vop_readlink_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct ucred *cred = ap->a_cred;
	struct fuse_dispatcher fdi;
	int err;

	if (fuse_isdeadfs(vp)) {
		return ENXIO;
	}
	if (!vnode_islnk(vp)) {
		return EINVAL;
	}
	fdisp_init(&fdi, 0);
	err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred);
	if (err) {
		goto out;
	}
	if (((char *)fdi.answ)[0] == '/' &&
	    fuse_get_mpdata(vnode_mount(vp))->dataflags &
	    FSESS_PUSH_SYMLINKS_IN) {
		char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname;

		err = uiomove(mpth, strlen(mpth), uio);
	}
	if (!err) {
		err = uiomove(fdi.answ, fdi.iosize, uio);
	}
out:
	fdisp_destroy(&fdi);
	return err;
}
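/*
 * Worked example, illustrative only: with FSESS_PUSH_SYMLINKS_IN set on
 * a file system mounted at the hypothetical path /mnt/fuse,
 * fuse_vnop_readlink() above rewrites an absolute link target of
 * "/data/file" reported by the daemon into "/mnt/fuse/data/file" by
 * copying out the mount point name before the target itself.
 */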
/*
    struct vnop_reclaim_args {
	struct vnode *a_vp;
	struct thread *a_td;
    };
*/
static int
fuse_vnop_reclaim(struct vop_reclaim_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct thread *td = ap->a_td;
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	struct fuse_filehandle *fufh, *fufh_tmp;

	if (!fvdat) {
		panic("FUSE: no vnode data during recycling");
	}
	LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) {
		printf("FUSE: vnode being reclaimed with open fufh "
			"(type=%#x)", fufh->fufh_type);
		fuse_filehandle_close(vp, fufh, td, NULL);
	}

	if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) {
		fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp),
		    fvdat->nlookup);
	}
	fuse_vnode_setparent(vp, NULL);
	cache_purge(vp);
	vfs_hash_remove(vp);
-	vnode_destroy_vobject(vp);
	fuse_vnode_destroy(vp);

	return 0;
}
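/*
 * Sketch, not from this patch: the shape of a minimal VOP_RECLAIM
 * implementation after this change.  Per-filesystem reclaim code no
 * longer calls vnode_destroy_vobject() (note the removed line above);
 * presumably the VM object is now torn down by the generic VFS reclaim
 * path instead.  The "myfs" name and the M_TEMP malloc type are
 * hypothetical placeholders.
 */
static int
myfs_reclaim(struct vop_reclaim_args *ap)
{
	struct vnode *vp = ap->a_vp;

	/* Drop the vnode from the per-mount hash, as the code above does. */
	vfs_hash_remove(vp);
	/* Release filesystem-private node data; no VM object work here. */
	free(vp->v_data, M_TEMP);
	vp->v_data = NULL;
	return (0);
}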
/*
    struct vnop_remove_args {
	struct vnode *a_dvp;
	struct vnode *a_vp;
	struct componentname *a_cnp;
    };
*/
static int
fuse_vnop_remove(struct vop_remove_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vnode *vp = ap->a_vp;
	struct componentname *cnp = ap->a_cnp;
	int err;

	if (fuse_isdeadfs(vp)) {
		return ENXIO;
	}
	if (vnode_isdir(vp)) {
		return EPERM;
	}

	err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK);

	return err;
}

/*
    struct vnop_rename_args {
	struct vnode *a_fdvp;
	struct vnode *a_fvp;
	struct componentname *a_fcnp;
	struct vnode *a_tdvp;
	struct vnode *a_tvp;
	struct componentname *a_tcnp;
    };
*/
static int
fuse_vnop_rename(struct vop_rename_args *ap)
{
	struct vnode *fdvp = ap->a_fdvp;
	struct vnode *fvp = ap->a_fvp;
	struct componentname *fcnp = ap->a_fcnp;
	struct vnode *tdvp = ap->a_tdvp;
	struct vnode *tvp = ap->a_tvp;
	struct componentname *tcnp = ap->a_tcnp;
	struct fuse_data *data;
	bool newparent = fdvp != tdvp;
	bool isdir = fvp->v_type == VDIR;
	int err = 0;

	if (fuse_isdeadfs(fdvp)) {
		return ENXIO;
	}
	if (fvp->v_mount != tdvp->v_mount ||
	    (tvp && fvp->v_mount != tvp->v_mount)) {
		SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename");
		err = EXDEV;
		goto out;
	}
	cache_purge(fvp);

	/*
	 * The FUSE library is expected to check that the target directory is
	 * not under the source directory in the file system tree.
	 * Linux performs this check at the VFS level.
	 */

	/*
	 * If the source is a directory and it will get a new parent, the
	 * user must have write permission to it, so that its ".." entry can
	 * be modified.
	 */
	data = fuse_get_mpdata(vnode_mount(tdvp));
	if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) {
		err = fuse_internal_access(fvp, VWRITE,
			tcnp->cn_thread, tcnp->cn_cred);
		if (err)
			goto out;
	}
	sx_xlock(&data->rename_lock);
	err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp);
	if (err == 0) {
		if (tdvp != fdvp)
			fuse_vnode_setparent(fvp, tdvp);
		if (tvp != NULL)
			fuse_vnode_setparent(tvp, NULL);
	}
	sx_unlock(&data->rename_lock);

	if (tvp != NULL && tvp != fvp) {
		cache_purge(tvp);
	}
	if (vnode_isdir(fvp)) {
		if ((tvp != NULL) && vnode_isdir(tvp)) {
			cache_purge(tdvp);
		}
		cache_purge(fdvp);
	}
out:
	if (tdvp == tvp) {
		vrele(tdvp);
	} else {
		vput(tdvp);
	}
	if (tvp != NULL) {
		vput(tvp);
	}
	vrele(fdvp);
	vrele(fvp);

	return err;
}

/*
    struct vnop_rmdir_args {
	    struct vnode *a_dvp;
	    struct vnode *a_vp;
	    struct componentname *a_cnp;
    } *ap;
*/
static int
fuse_vnop_rmdir(struct vop_rmdir_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vnode *vp = ap->a_vp;
	int err;

	if (fuse_isdeadfs(vp)) {
		return ENXIO;
	}
	if (VTOFUD(vp) == VTOFUD(dvp)) {
		return EINVAL;
	}
	err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR);

	return err;
}

/*
    struct vnop_setattr_args {
	struct vnode *a_vp;
	struct vattr *a_vap;
	struct ucred *a_cred;
	struct thread *a_td;
    };
*/
static int
fuse_vnop_setattr(struct vop_setattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vattr *vap = ap->a_vap;
	struct ucred *cred = ap->a_cred;
	struct thread *td = curthread;
	struct mount *mp;
	struct fuse_data *data;
	struct vattr old_va;
	int dataflags;
	int err = 0, err2;
	accmode_t accmode = 0;
	bool checkperm;
	bool drop_suid = false;
	gid_t cr_gid;

	mp = vnode_mount(vp);
	data = fuse_get_mpdata(mp);
	dataflags = data->dataflags;
	checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS;
	if (cred->cr_ngroups > 0)
		cr_gid = cred->cr_groups[0];
	else
		cr_gid = 0;

	if (fuse_isdeadfs(vp)) {
		return ENXIO;
	}

	if (vap->va_uid != (uid_t)VNOVAL) {
		if (checkperm) {
			/* Only root may change a file's owner */
			err = priv_check_cred(cred, PRIV_VFS_CHOWN);
			if (err) {
				/* As a special case, allow the null chown */
				err2 = fuse_internal_getattr(vp, &old_va, cred,
					td);
				if (err2)
					return (err2);
				if (vap->va_uid != old_va.va_uid)
					return err;
				else
					accmode |= VADMIN;
				drop_suid = true;
			} else
				accmode |= VADMIN;
		} else
			accmode |= VADMIN;
	}
	if (vap->va_gid != (gid_t)VNOVAL) {
		if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN))
			drop_suid = true;
		if (checkperm && !groupmember(vap->va_gid, cred)) {
			/*
			 * Non-root users may only chgrp to one of their own
			 * groups.
			 */
			err = priv_check_cred(cred, PRIV_VFS_CHOWN);
			if (err) {
				/* As a special case, allow the null chgrp */
				err2 = fuse_internal_getattr(vp, &old_va, cred,
					td);
				if (err2)
					return (err2);
				if (vap->va_gid != old_va.va_gid)
					return err;
				accmode |= VADMIN;
			} else
				accmode |= VADMIN;
		} else
			accmode |= VADMIN;
	}
	if (vap->va_size != VNOVAL) {
		switch (vp->v_type) {
		case VDIR:
			return (EISDIR);
		case VLNK:
		case VREG:
			if (vfs_isrdonly(mp))
				return (EROFS);
			break;
		default:
			/*
			 * According to POSIX, the result is unspecified
			 * for file types other than regular files,
			 * directories and shared memory objects.  We
			 * don't support shared memory objects in the file
			 * system, and have dubious support for truncating
			 * symlinks.  Just ignore the request in other cases.
			 */
			return (0);
		}
		/* Don't set accmode.
Permission to trunc is checked upstack */ } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vap->va_vaflags & VA_UTIMES_NULL) accmode |= VWRITE; else accmode |= VADMIN; } if (drop_suid) { if (vap->va_mode != (mode_t)VNOVAL) vap->va_mode &= ~(S_ISUID | S_ISGID); else { err = fuse_internal_getattr(vp, &old_va, cred, td); if (err) return (err); vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID); } } if (vap->va_mode != (mode_t)VNOVAL) { /* Only root may set the sticky bit on non-directories */ if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT) && priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return EFTYPE; if (checkperm && (vap->va_mode & S_ISGID)) { err = fuse_internal_getattr(vp, &old_va, cred, td); if (err) return (err); if (!groupmember(old_va.va_gid, cred)) { err = priv_check_cred(cred, PRIV_VFS_SETGID); if (err) return (err); } } accmode |= VADMIN; } if (vfs_isrdonly(mp)) return EROFS; err = fuse_internal_access(vp, accmode, td, cred); if (err) return err; else return fuse_internal_setattr(vp, vap, td, cred); } /* struct vnop_strategy_args { struct vnode *a_vp; struct buf *a_bp; }; */ static int fuse_vnop_strategy(struct vop_strategy_args *ap) { struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; if (!vp || fuse_isdeadfs(vp)) { bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); return 0; } /* * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags. * fuse_io_strategy sets bp's error fields */ (void)fuse_io_strategy(vp, bp); return 0; } /* struct vnop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; */ static int fuse_vnop_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; const char *target = ap->a_target; struct fuse_dispatcher fdi; int err; size_t len; if (fuse_isdeadfs(dvp)) { return ENXIO; } /* * Unlike the other creator type calls, here we have to create a message * where the name of the new entry comes first, and the data describing * the entry comes second. * Hence we can't rely on our handy fuse_internal_newentry() routine, * but put together the message manually and just call the core part. 
*/ len = strlen(target) + 1; fdisp_init(&fdi, len + cnp->cn_namelen + 1); fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len); err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi); fdisp_destroy(&fdi); return err; } /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_write(struct vop_write_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } return fuse_io_dispatch(vp, uio, ioflag, cred, pid); } static daddr_t fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { const int biosize = fuse_iosize(vp); return (off / biosize); } static int fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn) { off_t filesize; int blksz, err; const int biosize = fuse_iosize(vp); err = fuse_vnode_size(vp, &filesize, NULL, NULL); KASSERT(err == 0, ("vfs_bio_getpages can't handle errors here")); if (err) return biosize; if ((off_t)lbn * biosize >= filesize) { blksz = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { blksz = filesize - (off_t)lbn *biosize; } else { blksz = biosize; } return (blksz); } /* struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; }; */ static int fuse_vnop_getpages(struct vop_getpages_args *ap) { struct vnode *vp = ap->a_vp; if (!fsess_opt_mmap(vnode_mount(vp))) { SDT_PROBE2(fusefs, , vnops, trace, 1, "called on non-cacheable vnode??\n"); return (VM_PAGER_ERROR); } return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz)); } static const char extattr_namespace_separator = '.'; /* struct vop_getextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getextattr(struct vop_getextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_getxattr_in *get_xattr_in; struct fuse_getxattr_out *get_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; char *attr_str; size_t len; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (!fsess_isimpl(mp, FUSE_GETXATTR)) return EOPNOTSUPP; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*get_xattr_in)); fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, td, cred); get_xattr_in = fdi.indata; /* * Check to see whether we're querying the available size or * issuing the actual request. If we pass in 0, we get back struct * fuse_getxattr_out. If we pass in a non-zero size, we get back * that much data, without the struct fuse_getxattr_out header. 
 */
	if (uio == NULL)
		get_xattr_in->size = 0;
	else
		get_xattr_in->size = uio->uio_resid;

	attr_str = (char *)fdi.indata + sizeof(*get_xattr_in);
	snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator,
	    ap->a_name);

	err = fdisp_wait_answ(&fdi);
	if (err != 0) {
		if (err == ENOSYS) {
			fsess_set_notimpl(mp, FUSE_GETXATTR);
			err = EOPNOTSUPP;
		}
		goto out;
	}

	get_xattr_out = fdi.answ;

	if (ap->a_size != NULL)
		*ap->a_size = get_xattr_out->size;

	if (uio != NULL)
		err = uiomove(fdi.answ, fdi.iosize, uio);

out:
	fdisp_destroy(&fdi);
	return (err);
}

/*
    struct vop_setextattr_args {
	struct vop_generic_args a_gen;
	struct vnode *a_vp;
	int a_attrnamespace;
	const char *a_name;
	struct uio *a_uio;
	struct ucred *a_cred;
	struct thread *a_td;
    };
*/
static int
fuse_vnop_setextattr(struct vop_setextattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct fuse_dispatcher fdi;
	struct fuse_setxattr_in *set_xattr_in;
	struct mount *mp = vnode_mount(vp);
	struct thread *td = ap->a_td;
	struct ucred *cred = ap->a_cred;
	char *prefix;
	size_t len;
	char *attr_str;
	int err;

	if (fuse_isdeadfs(vp))
		return (ENXIO);

	if (!fsess_isimpl(mp, FUSE_SETXATTR))
		return EOPNOTSUPP;

	if (vfs_isrdonly(mp))
		return EROFS;

	/* Deleting xattrs must use VOP_DELETEEXTATTR instead */
	if (ap->a_uio == NULL) {
		/*
		 * If we got here as fallback from VOP_DELETEEXTATTR, then
		 * return EOPNOTSUPP.
		 */
		if (!fsess_isimpl(mp, FUSE_REMOVEXATTR))
			return (EOPNOTSUPP);
		else
			return (EINVAL);
	}

	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
		VWRITE);
	if (err)
		return err;

	/* Default to looking for user attributes. */
	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
	else
		prefix = EXTATTR_NAMESPACE_USER_STRING;

	len = strlen(prefix) + sizeof(extattr_namespace_separator) +
	    strlen(ap->a_name) + 1;

	fdisp_init(&fdi, len + sizeof(*set_xattr_in) + uio->uio_resid);
	fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred);

	set_xattr_in = fdi.indata;
	set_xattr_in->size = uio->uio_resid;

	attr_str = (char *)fdi.indata + sizeof(*set_xattr_in);
	snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator,
	    ap->a_name);

	err = uiomove((char *)fdi.indata + sizeof(*set_xattr_in) + len,
	    uio->uio_resid, uio);
	if (err != 0) {
		goto out;
	}

	err = fdisp_wait_answ(&fdi);

	if (err == ENOSYS) {
		fsess_set_notimpl(mp, FUSE_SETXATTR);
		err = EOPNOTSUPP;
	}
	if (err == ERESTART) {
		/* Can't restart after calling uiomove */
		err = EINTR;
	}

out:
	fdisp_destroy(&fdi);
	return (err);
}
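/*
 * Worked example, illustrative only: what fuse_xattrlist_convert()
 * below produces for a hypothetical two-attribute list in the "user"
 * namespace.  The forward declaration mirrors the definition that
 * follows.
 */
static int fuse_xattrlist_convert(char *prefix, const char *list,
	int list_len, char *bsd_list, int *bsd_list_len);

static void
fuse_xattrlist_convert_example(void)
{
	/* Two Linux-style names, each NUL-terminated. */
	static const char linux_list[] = "user.attr1\0user.attr2";
	char bsd_list[sizeof(linux_list)];
	int bsd_list_len;

	/*
	 * Expected result: "\x05attr1\x05attr2", 12 bytes total; each
	 * name loses its "user." prefix and gains a one-byte length.
	 */
	(void)fuse_xattrlist_convert("user", linux_list, sizeof(linux_list),
	    bsd_list, &bsd_list_len);
}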
/*
 * The Linux / FUSE extended attribute list is simply a collection of
 * NUL-terminated strings.  The FreeBSD extended attribute list is a single
 * byte length followed by a non-NUL terminated string.  So, this allows
 * conversion of the Linux / FUSE format to the FreeBSD format in place.
 * Linux attribute names are reported with the namespace as a prefix (e.g.
 * "user.attribute_name"), but in FreeBSD they are reported without the
 * namespace prefix (e.g. "attribute_name").  So, we're going from:
 *
 * user.attr_name1\0user.attr_name2\0
 *
 * to:
 *
 * <len>attr_name1<len>attr_name2
 *
 * Where "<len>" is a single byte number of characters in the attribute name.
 *
 * Args:
 * prefix - extattr namespace prefix string
 * list, list_len - input list with namespace prefixes
 * bsd_list, bsd_list_len - output list compatible with bsd vfs
 */
static int
fuse_xattrlist_convert(char *prefix, const char *list, int list_len,
	char *bsd_list, int *bsd_list_len)
{
	int len, pos, dist_to_next, prefix_len;

	pos = 0;
	*bsd_list_len = 0;
	prefix_len = strlen(prefix);

	while (pos < list_len && list[pos] != '\0') {
		dist_to_next = strlen(&list[pos]) + 1;
		if (bcmp(&list[pos], prefix, prefix_len) == 0 &&
		    list[pos + prefix_len] == extattr_namespace_separator) {
			len = dist_to_next -
			    (prefix_len +
			    sizeof(extattr_namespace_separator)) - 1;
			if (len >= EXTATTR_MAXNAMELEN)
				return (ENAMETOOLONG);

			bsd_list[*bsd_list_len] = len;
			memcpy(&bsd_list[*bsd_list_len + 1],
			    &list[pos + prefix_len +
			    sizeof(extattr_namespace_separator)], len);

			*bsd_list_len += len + 1;
		}
		pos += dist_to_next;
	}

	return (0);
}

/*
 * List extended attributes
 *
 * The FUSE_LISTXATTR operation is based on Linux's listxattr(2) syscall, which
 * has a number of differences compared to its FreeBSD equivalent,
 * extattr_list_file:
 *
 * - FUSE_LISTXATTR returns all extended attributes across all namespaces,
 *   whereas listxattr(2) only returns attributes for a single namespace
 * - FUSE_LISTXATTR prepends each attribute name with "namespace."
 * - If the provided buffer is not large enough to hold the result,
 *   FUSE_LISTXATTR should return ERANGE, whereas listxattr is expected to
 *   return as many results as will fit.
 */
/*
    struct vop_listextattr_args {
	struct vop_generic_args a_gen;
	struct vnode *a_vp;
	int a_attrnamespace;
	struct uio *a_uio;
	size_t *a_size;
	struct ucred *a_cred;
	struct thread *a_td;
    };
*/
static int
fuse_vnop_listextattr(struct vop_listextattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct fuse_dispatcher fdi;
	struct fuse_listxattr_in *list_xattr_in;
	struct fuse_listxattr_out *list_xattr_out;
	struct mount *mp = vnode_mount(vp);
	struct thread *td = ap->a_td;
	struct ucred *cred = ap->a_cred;
	char *prefix;
	char *bsd_list = NULL;
	char *linux_list;
	int bsd_list_len;
	int linux_list_len;
	int err;

	if (fuse_isdeadfs(vp))
		return (ENXIO);

	if (!fsess_isimpl(mp, FUSE_LISTXATTR))
		return EOPNOTSUPP;

	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
		VREAD);
	if (err)
		return err;

	/*
	 * Add space for a NUL and the period separator if enabled.
	 * Default to looking for user attributes.
	 */
	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
	else
		prefix = EXTATTR_NAMESPACE_USER_STRING;

	fdisp_init(&fdi, sizeof(*list_xattr_in));
	fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);

	/*
	 * Retrieve Linux / FUSE compatible list size.
	 */
	list_xattr_in = fdi.indata;
	list_xattr_in->size = 0;

	err = fdisp_wait_answ(&fdi);
	if (err != 0) {
		if (err == ENOSYS) {
			fsess_set_notimpl(mp, FUSE_LISTXATTR);
			err = EOPNOTSUPP;
		}
		goto out;
	}

	list_xattr_out = fdi.answ;
	linux_list_len = list_xattr_out->size;
	if (linux_list_len == 0) {
		if (ap->a_size != NULL)
			*ap->a_size = linux_list_len;
		goto out;
	}

	/*
	 * Retrieve Linux / FUSE compatible list values.
	 */
	fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
	list_xattr_in = fdi.indata;
	list_xattr_in->size = linux_list_len;

	err = fdisp_wait_answ(&fdi);
	if (err == ERANGE) {
		/*
		 * Race detected.  The attribute list must've grown since the
		 * first FUSE_LISTXATTR call.  Start over.  Go all the way back
		 * to userland so we can process signals, if necessary, before
		 * restarting.
*/ err = ERESTART; goto out; } else if (err != 0) goto out; linux_list = fdi.answ; /* FUSE doesn't allow the server to return more data than requested */ if (fdi.iosize > linux_list_len) { printf("WARNING: FUSE protocol violation. Server returned " "more extended attribute data than requested; " "should've returned ERANGE instead"); } else { /* But returning less data is fine */ linux_list_len = fdi.iosize; } /* * Retrieve the BSD compatible list values. * The Linux / FUSE attribute list format isn't the same * as FreeBSD's format. So we need to transform it into * FreeBSD's format before giving it to the user. */ bsd_list = malloc(linux_list_len, M_TEMP, M_WAITOK); err = fuse_xattrlist_convert(prefix, linux_list, linux_list_len, bsd_list, &bsd_list_len); if (err != 0) goto out; if (ap->a_size != NULL) *ap->a_size = bsd_list_len; if (uio != NULL) err = uiomove(bsd_list, bsd_list_len, uio); out: free(bsd_list, M_TEMP); fdisp_destroy(&fdi); return (err); } /* struct vop_deleteextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_dispatcher fdi; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (!fsess_isimpl(mp, FUSE_REMOVEXATTR)) return EOPNOTSUPP; if (vfs_isrdonly(mp)) return EROFS; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VWRITE); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len); fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, td, cred); attr_str = fdi.indata; snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_REMOVEXATTR); err = EOPNOTSUPP; } fdisp_destroy(&fdi); return (err); } /* struct vnop_print_args { struct vnode *a_vp; }; */ static int fuse_vnop_print(struct vop_print_args *ap) { struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp); printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n", (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid, (uintmax_t)fvdat->nlookup, fvdat->flag); return 0; } /* * Get an NFS filehandle for a FUSE file. * * This will only work for FUSE file systems that guarantee the uniqueness of * nodeid:generation, which most don't. 
*/ /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ static int fuse_vnop_vptofh(struct vop_vptofh_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp); _Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid), "FUSE fid type is too big"); struct mount *mp = vnode_mount(vp); struct fuse_data *data = fuse_get_mpdata(mp); struct vattr va; int err; if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) return EOPNOTSUPP; err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread); if (err) return err; /*ip = VTOI(ap->a_vp);*/ /*ufhp = (struct ufid *)ap->a_fhp;*/ fhp->len = sizeof(struct fuse_fid); fhp->nid = fvdat->nid; if (fvdat->generation <= UINT32_MAX) fhp->gen = fvdat->generation; else return EOVERFLOW; return (0); } diff --git a/sys/fs/msdosfs/msdosfs_denode.c b/sys/fs/msdosfs/msdosfs_denode.c index 19ba40fed97f..858cf1b58908 100644 --- a/sys/fs/msdosfs/msdosfs_denode.c +++ b/sys/fs/msdosfs/msdosfs_denode.c @@ -1,620 +1,616 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_denode.c,v 1.28 1998/02/10 14:10:00 mrg Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. 
 *
 * October 1992
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static MALLOC_DEFINE(M_MSDOSFSNODE, "msdosfs_node", "MSDOSFS vnode private part");

static int
de_vncmpf(struct vnode *vp, void *arg)
{
	struct denode *de;
	uint64_t *a;

	a = arg;
	de = VTODE(vp);
	return (de->de_inode != *a);
}

/*
 * If deget() succeeds it returns with the gotten denode locked().
 *
 * pmp	     - address of msdosfsmount structure of the filesystem containing
 *	       the denode of interest.  The address of
 *	       the msdosfsmount structure is used.
 * dirclust  - which cluster bp contains, if dirclust is 0 (root directory)
 *	       diroffset is relative to the beginning of the root directory,
 *	       otherwise it is cluster relative.
 * diroffset - offset past begin of cluster of denode we want
 * depp	     - returns the address of the gotten denode.
 */
int
deget(struct msdosfsmount *pmp, u_long dirclust, u_long diroffset,
    struct denode **depp)
{
	int error;
	uint64_t inode;
	struct mount *mntp = pmp->pm_mountp;
	struct direntry *direntptr;
	struct denode *ldep;
	struct vnode *nvp, *xvp;
	struct buf *bp;

#ifdef MSDOSFS_DEBUG
	printf("deget(pmp %p, dirclust %lu, diroffset %lx, depp %p)\n",
	    pmp, dirclust, diroffset, depp);
#endif

	/*
	 * On FAT32 filesystems, root is a (more or less) normal
	 * directory.
	 */
	if (FAT32(pmp) && dirclust == MSDOSFSROOT)
		dirclust = pmp->pm_rootdirblk;

	/*
	 * See if the denode is in the denode cache.  Use the location of
	 * the directory entry to compute the hash value.  For subdir use
	 * address of "." entry.  For root dir (if not FAT32) use cluster
	 * MSDOSFSROOT, offset MSDOSFSROOT_OFS.
	 *
	 * NOTE: The check for de_refcnt > 0 below ensures the denode being
	 * examined does not represent an unlinked but still open file.
	 * These files are not to be accessible even when the directory
	 * entry that represented the file happens to be reused while the
	 * deleted file is still open.
	 */
	inode = (uint64_t)pmp->pm_bpcluster * dirclust + diroffset;

	error = vfs_hash_get(mntp, inode, LK_EXCLUSIVE, curthread, &nvp,
	    de_vncmpf, &inode);
	if (error)
		return (error);
	if (nvp != NULL) {
		*depp = VTODE(nvp);
		KASSERT((*depp)->de_dirclust == dirclust, ("wrong dirclust"));
		KASSERT((*depp)->de_diroffset == diroffset, ("wrong diroffset"));
		return (0);
	}
	ldep = malloc(sizeof(struct denode), M_MSDOSFSNODE, M_WAITOK | M_ZERO);

	/*
	 * Directory entry was not in cache, have to create a vnode and
	 * copy it from the passed disk buffer.
	 */

	/* getnewvnode() does a VREF() on the vnode */
	error = getnewvnode("msdosfs", mntp, &msdosfs_vnodeops, &nvp);
	if (error) {
		*depp = NULL;
		free(ldep, M_MSDOSFSNODE);
		return error;
	}
	nvp->v_data = ldep;
	ldep->de_vnode = nvp;
	ldep->de_flag = 0;
	ldep->de_dirclust = dirclust;
	ldep->de_diroffset = diroffset;
	ldep->de_inode = inode;
	lockmgr(nvp->v_vnlock, LK_EXCLUSIVE, NULL);
	fc_purge(ldep, 0);	/* init the FAT cache for this denode */
	error = insmntque(nvp, mntp);
	if (error != 0) {
		free(ldep, M_MSDOSFSNODE);
		*depp = NULL;
		return (error);
	}
	error = vfs_hash_insert(nvp, inode, LK_EXCLUSIVE, curthread, &xvp,
	    de_vncmpf, &inode);
	if (error) {
		*depp = NULL;
		return (error);
	}
	if (xvp != NULL) {
		*depp = xvp->v_data;
		return (0);
	}

	ldep->de_pmp = pmp;
	ldep->de_refcnt = 1;
	/*
	 * Copy the directory entry into the denode area of the vnode.
	 */
	if ((dirclust == MSDOSFSROOT ||
	    (FAT32(pmp) && dirclust == pmp->pm_rootdirblk)) &&
	    diroffset == MSDOSFSROOT_OFS) {
		/*
		 * Directory entry for the root directory.  There isn't one,
		 * so we manufacture one.
We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. */ nvp->v_vflag |= VV_ROOT; /* should be further down XXX */ ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_LowerCase = 0; if (FAT32(pmp)) ldep->de_StartCluster = pmp->pm_rootdirblk; /* de_FileSize will be filled in further down */ else { ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * DEV_BSIZE; } /* * fill in time and date so that fattime2timespec() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_CHun = 0; ldep->de_CTime = 0x0000; /* 00:00:00 */ ldep->de_CDate = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ ldep->de_ADate = ldep->de_CDate; ldep->de_MTime = ldep->de_CTime; ldep->de_MDate = ldep->de_CDate; /* leave the other fields as garbage */ } else { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) { /* * The denode does not contain anything useful, so * it would be wrong to leave it on its hash chain. * Arrange for vput() to just forget about it. */ ldep->de_Name[0] = SLOT_DELETED; vput(nvp); *depp = NULL; return (error); } (void)DE_INTERNALIZE(ldep, direntptr); brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure. */ u_long size; /* * XXX it sometimes happens that the "." entry has cluster * number 0 when it shouldn't. Use the actual cluster number * instead of what is written in directory entry. */ if (diroffset == 0 && ldep->de_StartCluster != dirclust) { #ifdef MSDOSFS_DEBUG printf("deget(): \".\" entry at clust %lu != %lu\n", dirclust, ldep->de_StartCluster); #endif ldep->de_StartCluster = dirclust; } nvp->v_type = VDIR; if (ldep->de_StartCluster != MSDOSFSROOT) { error = pcbmap(ldep, 0xffff, 0, &size, 0); if (error == E2BIG) { ldep->de_FileSize = de_cn2off(pmp, size); error = 0; } else { #ifdef MSDOSFS_DEBUG printf("deget(): pcbmap returned %d\n", error); #endif } } } else nvp->v_type = VREG; ldep->de_modrev = init_va_filerev(); *depp = ldep; return (0); } int deupdat(struct denode *dep, int waitfor) { struct direntry dir; struct timespec ts; struct buf *bp; struct direntry *dirp; int error; if (DETOV(dep)->v_mount->mnt_flag & MNT_RDONLY) { dep->de_flag &= ~(DE_UPDATE | DE_CREATE | DE_ACCESS | DE_MODIFIED); return (0); } vfs_timestamp(&ts); DETIMES(dep, &ts, &ts, &ts); if ((dep->de_flag & DE_MODIFIED) == 0 && waitfor == 0) return (0); dep->de_flag &= ~DE_MODIFIED; if (DETOV(dep)->v_vflag & VV_ROOT) return (EINVAL); if (dep->de_refcnt <= 0) return (0); error = readde(dep, &bp, &dirp); if (error) return (error); DE_EXTERNALIZE(&dir, dep); if (bcmp(dirp, &dir, sizeof(dir)) == 0) { if (waitfor == 0 || (bp->b_flags & B_DELWRI) == 0) { brelse(bp); return (0); } } else *dirp = dir; if ((DETOV(dep)->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) bp->b_flags |= B_CLUSTEROK; if (waitfor) error = bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) bawrite(bp); else bdwrite(bp); return (error); } /* * Truncate the file described by dep to the length specified by length. 
*/ int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred) { int error; int allerror; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; #ifdef MSDOSFS_DEBUG printf("detrunc(): file %s, length %lu, flags %x\n", dep->de_Name, length, flags); #endif /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp)) { #ifdef MSDOSFS_DEBUG printf("detrunc(): can't truncate root directory, clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset); #endif return (EINVAL); } if (dep->de_FileSize < length) { vnode_pager_setsize(DETOV(dep), length); return deextend(dep, length, cred); } /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. */ if (length == 0) { chaintofree = dep->de_StartCluster; dep->de_StartCluster = 0; eofentry = ~0; } else { error = pcbmap(dep, de_clcount(pmp, length) - 1, 0, &eofentry, 0); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): pcbmap fails %d\n", error); #endif return (error); } } fc_purge(dep, de_clcount(pmp, length)); /* * If the new length is not a multiple of the cluster size then we * must zero the tail end of the new last cluster in case it * becomes part of the file again because of a seek. */ if ((boff = length & pmp->pm_crbomask) != 0) { if (isadir) { bn = cntobn(pmp, eofentry); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); } else { error = bread(DETOV(dep), de_cluster(pmp, length), pmp->pm_bpcluster, cred, &bp); } if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): bread fails %d\n", error); #endif return (error); } memset(bp->b_data + boff, 0, pmp->pm_bpcluster - boff); if ((flags & IO_SYNC) != 0) bwrite(bp); else bdwrite(bp); } /* * Write out the updated directory entry. Even if the update fails * we free the trailing clusters. */ dep->de_FileSize = length; if (!isadir) dep->de_flag |= DE_UPDATE | DE_MODIFIED; allerror = vtruncbuf(DETOV(dep), length, pmp->pm_bpcluster); #ifdef MSDOSFS_DEBUG if (allerror) printf("detrunc(): vtruncbuf error %d\n", allerror); #endif error = deupdat(dep, !DOINGASYNC((DETOV(dep)))); if (error != 0 && allerror == 0) allerror = error; #ifdef MSDOSFS_DEBUG printf("detrunc(): allerror %d, eofentry %lu\n", allerror, eofentry); #endif /* * If we need to break the cluster chain for the file then do it * now. */ if (eofentry != ~0) { error = fatentry(FAT_GET_AND_SET, pmp, eofentry, &chaintofree, CLUST_EOFE); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): fatentry errors %d\n", error); #endif return (error); } fc_setcache(dep, FC_LASTFC, de_cluster(pmp, length - 1), eofentry); } /* * Now free the clusters removed from the file because of the * truncation. 
*/ if (chaintofree != 0 && !MSDOSFSEOF(pmp, chaintofree)) freeclusterchain(pmp, chaintofree); return (allerror); } /* * Extend the file described by dep to length specified by length. */ int deextend(struct denode *dep, u_long length, struct ucred *cred) { struct msdosfsmount *pmp = dep->de_pmp; u_long count; int error; /* * The root of a DOS filesystem cannot be extended. */ if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp)) return (EINVAL); /* * Directories cannot be extended. */ if (dep->de_Attributes & ATTR_DIRECTORY) return (EISDIR); if (length <= dep->de_FileSize) panic("deextend: file too large"); /* * Compute the number of clusters to allocate. */ count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize); if (count > 0) { if (count > pmp->pm_freeclustercount) return (ENOSPC); error = extendfile(dep, count, NULL, NULL, DE_CLEAR); if (error) { /* truncate the added clusters away again */ (void) detrunc(dep, dep->de_FileSize, 0, cred); return (error); } } dep->de_FileSize = length; dep->de_flag |= DE_UPDATE | DE_MODIFIED; return (deupdat(dep, !DOINGASYNC(DETOV(dep)))); } /* * Move a denode to its correct hash queue after the file it represents has * been moved to a new directory. */ void reinsert(struct denode *dep) { struct vnode *vp; /* * Fix up the denode cache. If the denode is for a directory, * there is nothing to do since the hash is based on the starting * cluster of the directory file and that hasn't changed. If for a * file the hash is based on the location of the directory entry, * so we must remove it from the cache and re-enter it with the * hash based on the new location of the directory entry. */ #if 0 if (dep->de_Attributes & ATTR_DIRECTORY) return; #endif vp = DETOV(dep); dep->de_inode = (uint64_t)dep->de_pmp->pm_bpcluster * dep->de_dirclust + dep->de_diroffset; vfs_hash_rehash(vp, dep->de_inode); } int msdosfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %p, file %s, refcnt %ld\n", dep, dep->de_Name, dep->de_refcnt); #endif - /* - * Destroy the vm object and flush associated pages. - */ - vnode_destroy_vobject(vp); /* * Remove the denode from its hash chain. */ vfs_hash_remove(vp); /* * Purge old data structures associated with the denode. */ #if 0 /* XXX */ dep->de_flag = 0; #endif free(dep, M_MSDOSFSNODE); vp->v_data = NULL; return (0); } int msdosfs_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, de_Name[0] %x\n", dep, dep->de_Name[0]); #endif /* * Ignore denodes related to stale file handles. */ if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) goto out; /* * If the file has been deleted and it is on a read/write * filesystem, then truncate the file, and mark the directory slot * as empty. (This may not be necessary for the dos filesystem.) */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, refcnt %ld, mntflag %llx, MNT_RDONLY %llx\n", dep, dep->de_refcnt, (unsigned long long)vp->v_mount->mnt_flag, (unsigned long long)MNT_RDONLY); #endif if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { error = detrunc(dep, (u_long) 0, 0, NOCRED); dep->de_flag |= DE_UPDATE; dep->de_Name[0] = SLOT_DELETED; } deupdat(dep, 0); out: /* * If we are done with the denode, reclaim it * so that it can be reused immediately. 
*/ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): v_usecount %d, de_Name[0] %x\n", vrefcnt(vp), dep->de_Name[0]); #endif if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) vrecycle(vp); return (error); } diff --git a/sys/fs/nfsclient/nfs_clnode.c b/sys/fs/nfsclient/nfs_clnode.c index 9e54f6e9f238..0baf710d9cfa 100644 --- a/sys/fs/nfsclient/nfs_clnode.c +++ b/sys/fs/nfsclient/nfs_clnode.c @@ -1,363 +1,358 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from nfs_node.c 8.6 (Berkeley) 5/22/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct vop_vector newnfs_vnodeops; extern struct buf_ops buf_ops_newnfs; MALLOC_DECLARE(M_NEWNFSREQ); uma_zone_t newnfsnode_zone; const char nfs_vnode_tag[] = "nfs"; static void nfs_freesillyrename(void *arg, __unused int pending); void ncl_nhinit(void) { newnfsnode_zone = uma_zcreate("NCLNODE", sizeof(struct nfsnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } void ncl_nhuninit(void) { uma_zdestroy(newnfsnode_zone); } /* * ONLY USED FOR THE ROOT DIRECTORY. nfscl_nget() does the rest. If this * function is going to be used to get Regular Files, code must be added * to fill in the "struct nfsv4node". * Look up a vnode/nfsnode by file handle. * Callers must check for mount points!! * In all cases, a pointer to a * nfsnode structure is returned. 
*/ int ncl_nget(struct mount *mntp, u_int8_t *fhp, int fhsize, struct nfsnode **npp, int lkflags) { struct thread *td = curthread; /* XXX */ struct nfsnode *np; struct vnode *vp; struct vnode *nvp; int error; u_int hash; struct nfsmount *nmp; struct nfsfh *nfhp; nmp = VFSTONFS(mntp); *npp = NULL; hash = fnv_32_buf(fhp, fhsize, FNV1_32_INIT); nfhp = malloc(sizeof (struct nfsfh) + fhsize, M_NFSFH, M_WAITOK); bcopy(fhp, &nfhp->nfh_fh[0], fhsize); nfhp->nfh_len = fhsize; error = vfs_hash_get(mntp, hash, lkflags, td, &nvp, newnfs_vncmpf, nfhp); free(nfhp, M_NFSFH); if (error) return (error); if (nvp != NULL) { *npp = VTONFS(nvp); return (0); } np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO); error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp); if (error) { uma_zfree(newnfsnode_zone, np); return (error); } vp = nvp; KASSERT(vp->v_bufobj.bo_bsize != 0, ("ncl_nget: bo_bsize == 0")); vp->v_bufobj.bo_ops = &buf_ops_newnfs; vp->v_data = np; np->n_vnode = vp; /* * Initialize the mutex even if the vnode is going to be a loser. * This simplifies the logic in reclaim, which can then unconditionally * destroy the mutex (in the case of the loser, or if hash_insert * happened to return an error no special casing is needed). */ mtx_init(&np->n_mtx, "NEWNFSnode lock", NULL, MTX_DEF | MTX_DUPOK); lockinit(&np->n_excl, PVFS, "nfsupg", VLKTIMEOUT, LK_NOSHARE | LK_CANRECURSE); /* * NFS supports recursive and shared locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_NOWITNESS, NULL); VN_LOCK_AREC(vp); VN_LOCK_ASHARE(vp); /* * Are we getting the root? If so, make sure the vnode flags * are correct */ if ((fhsize == nmp->nm_fhsize) && !bcmp(fhp, nmp->nm_fh, fhsize)) { if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; } np->n_fhp = malloc(sizeof (struct nfsfh) + fhsize, M_NFSFH, M_WAITOK); bcopy(fhp, np->n_fhp->nfh_fh, fhsize); np->n_fhp->nfh_len = fhsize; error = insmntque(vp, mntp); if (error != 0) { *npp = NULL; free(np->n_fhp, M_NFSFH); mtx_destroy(&np->n_mtx); lockdestroy(&np->n_excl); uma_zfree(newnfsnode_zone, np); return (error); } error = vfs_hash_insert(vp, hash, lkflags, td, &nvp, newnfs_vncmpf, np->n_fhp); if (error) return (error); if (nvp != NULL) { *npp = VTONFS(nvp); /* vfs_hash_insert() vput()'s the losing vnode */ return (0); } *npp = np; return (0); } /* * Do the vrele(sp->s_dvp) as a separate task in order to avoid a * deadlock because of a LOR when vrele() locks the directory vnode. */ static void nfs_freesillyrename(void *arg, __unused int pending) { struct sillyrename *sp; sp = arg; vrele(sp->s_dvp); free(sp, M_NEWNFSREQ); } static void ncl_releasesillyrename(struct vnode *vp, struct thread *td) { struct nfsnode *np; struct sillyrename *sp; ASSERT_VOP_ELOCKED(vp, "releasesillyrename"); np = VTONFS(vp); mtx_assert(&np->n_mtx, MA_OWNED); if (vp->v_type != VDIR) { sp = np->n_sillyrename; np->n_sillyrename = NULL; } else sp = NULL; if (sp != NULL) { mtx_unlock(&np->n_mtx); (void) ncl_vinvalbuf(vp, 0, td, 1); /* * Remove the silly file that was rename'd earlier */ ncl_removeit(sp, vp); crfree(sp->s_cred); TASK_INIT(&sp->s_task, 0, nfs_freesillyrename, sp); taskqueue_enqueue(taskqueue_thread, &sp->s_task); mtx_lock(&np->n_mtx); } } int ncl_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np; boolean_t retv; if (NFS_ISV4(vp) && vp->v_type == VREG) { /* * Since mmap()'d files do I/O after VOP_CLOSE(), the NFSv4 * Close operations are delayed until now. 
Any dirty * buffers/pages must be flushed before the close, so that the * stateid is available for the writes. */ if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); retv = vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_object); } else retv = TRUE; if (retv == TRUE) { (void)ncl_flush(vp, MNT_WAIT, ap->a_td, 1, 0); (void)nfsrpc_close(vp, 1, ap->a_td); } } np = VTONFS(vp); mtx_lock(&np->n_mtx); ncl_releasesillyrename(vp, ap->a_td); /* * NMODIFIED means that there might be dirty/stale buffers * associated with the NFS vnode. * NDSCOMMIT means that the file is on a pNFS server and commits * should be done to the DS. * None of the other flags are meaningful after the vnode is unused. */ np->n_flag &= (NMODIFIED | NDSCOMMIT); mtx_unlock(&np->n_mtx); return (0); } /* * Reclaim an nfsnode so that it can be used for other purposes. */ int ncl_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct nfsdmap *dp, *dp2; /* * If the NLM is running, give it a chance to abort pending * locks. */ if (nfs_reclaim_p != NULL) nfs_reclaim_p(ap); mtx_lock(&np->n_mtx); ncl_releasesillyrename(vp, ap->a_td); mtx_unlock(&np->n_mtx); - /* - * Destroy the vm object and flush associated pages. - */ - vnode_destroy_vobject(vp); - if (NFS_ISV4(vp) && vp->v_type == VREG) /* * We can now safely close any remaining NFSv4 Opens for * this file. Most opens will have already been closed by * ncl_inactive(), but there are cases where it is not * called, so we need to do it again here. */ (void) nfsrpc_close(vp, 1, ap->a_td); vfs_hash_remove(vp); /* * Call nfscl_reclaimnode() to save attributes in the delegation, * as required. */ if (vp->v_type == VREG) nfscl_reclaimnode(vp); /* * Free up any directory cookie structures and * large file handle structures that might be associated with * this nfs node. */ if (vp->v_type == VDIR) { dp = LIST_FIRST(&np->n_cookies); while (dp) { dp2 = dp; dp = LIST_NEXT(dp, ndm_list); free(dp2, M_NFSDIROFF); } } if (np->n_writecred != NULL) crfree(np->n_writecred); free(np->n_fhp, M_NFSFH); if (np->n_v4 != NULL) free(np->n_v4, M_NFSV4NODE); mtx_destroy(&np->n_mtx); lockdestroy(&np->n_excl); uma_zfree(newnfsnode_zone, vp->v_data); vp->v_data = NULL; return (0); } /* * Invalidate both the access and attribute caches for this vnode. */ void ncl_invalcaches(struct vnode *vp) { struct nfsnode *np = VTONFS(vp); int i; mtx_lock(&np->n_mtx); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) np->n_accesscache[i].stamp = 0; KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); mtx_unlock(&np->n_mtx); } diff --git a/sys/fs/smbfs/smbfs_node.c b/sys/fs/smbfs/smbfs_node.c index 299d07a0b4af..16b5de938464 100644 --- a/sys/fs/smbfs/smbfs_node.c +++ b/sys/fs/smbfs/smbfs_node.c @@ -1,412 +1,408 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*#include #include */ #include #include #include extern struct vop_vector smbfs_vnodeops; /* XXX -> .h file */ static MALLOC_DEFINE(M_SMBNODE, "smbufs_node", "SMBFS vnode private part"); static MALLOC_DEFINE(M_SMBNODENAME, "smbufs_nname", "SMBFS node name"); u_int32_t __inline smbfs_hash(const u_char *name, int nmlen) { return (fnv_32_buf(name, nmlen, FNV1_32_INIT)); } static char * smbfs_name_alloc(const u_char *name, int nmlen) { u_char *cp; nmlen++; cp = malloc(nmlen, M_SMBNODENAME, M_WAITOK); bcopy(name, cp, nmlen - 1); cp[nmlen - 1] = 0; return cp; } static void smbfs_name_free(u_char *name) { free(name, M_SMBNODENAME); } static int __inline smbfs_vnode_cmp(struct vnode *vp, void *_sc) { struct smbnode *np; struct smbcmp *sc; np = (struct smbnode *) vp->v_data; sc = (struct smbcmp *) _sc; if (np->n_parent != sc->n_parent || np->n_nmlen != sc->n_nmlen || bcmp(sc->n_name, np->n_name, sc->n_nmlen) != 0) return 1; return 0; } static int smbfs_node_alloc(struct mount *mp, struct vnode *dvp, const char *dirnm, int dirlen, const char *name, int nmlen, char sep, struct smbfattr *fap, struct vnode **vpp) { struct vattr vattr; struct thread *td = curthread; /* XXX */ struct smbmount *smp = VFSTOSMBFS(mp); struct smbnode *np, *dnp; struct vnode *vp, *vp2; struct smbcmp sc; char *p, *rpath; int error, rplen; sc.n_parent = dvp; sc.n_nmlen = nmlen; sc.n_name = name; if (smp->sm_root != NULL && dvp == NULL) { SMBERROR("do not allocate root vnode twice!\n"); return EINVAL; } if (nmlen == 2 && bcmp(name, "..", 2) == 0) { if (dvp == NULL) return EINVAL; vp = VTOSMB(VTOSMB(dvp)->n_parent)->n_vnode; error = vget(vp, LK_EXCLUSIVE, td); if (error == 0) *vpp = vp; return error; } else if (nmlen == 1 && name[0] == '.') { SMBERROR("do not call me with dot!\n"); return EINVAL; } dnp = dvp ? VTOSMB(dvp) : NULL; if (dnp == NULL && dvp != NULL) { vn_printf(dvp, "smbfs_node_alloc: dead parent vnode "); return EINVAL; } error = vfs_hash_get(mp, smbfs_hash(name, nmlen), LK_EXCLUSIVE, td, vpp, smbfs_vnode_cmp, &sc); if (error) return (error); if (*vpp) { np = VTOSMB(*vpp); /* Force cached attributes to be refreshed if stale. */ (void)VOP_GETATTR(*vpp, &vattr, td->td_ucred); /* * If the file type on the server is inconsistent with * what it was when we created the vnode, kill the * bogus vnode now and fall through to the code below * to create a new one with the right type. 
*/ if (((*vpp)->v_type == VDIR && (np->n_dosattr & SMB_FA_DIR) == 0) || ((*vpp)->v_type == VREG && (np->n_dosattr & SMB_FA_DIR) != 0)) { vgone(*vpp); vput(*vpp); } else { SMBVDEBUG("vnode taken from the hashtable\n"); return (0); } } /* * If we don't have node attributes, then it is an explicit lookup * for an existing vnode. */ if (fap == NULL) return ENOENT; error = getnewvnode("smbfs", mp, &smbfs_vnodeops, vpp); if (error) return (error); vp = *vpp; np = malloc(sizeof *np, M_SMBNODE, M_WAITOK | M_ZERO); rplen = dirlen; if (sep != '\0') rplen++; rplen += nmlen; rpath = malloc(rplen + 1, M_SMBNODENAME, M_WAITOK); p = rpath; bcopy(dirnm, p, dirlen); p += dirlen; if (sep != '\0') *p++ = sep; if (name != NULL) { bcopy(name, p, nmlen); p += nmlen; } *p = '\0'; MPASS(p == rpath + rplen); lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); /* Vnode initialization */ vp->v_type = fap->fa_attr & SMB_FA_DIR ? VDIR : VREG; vp->v_data = np; np->n_vnode = vp; np->n_mount = VFSTOSMBFS(mp); np->n_rpath = rpath; np->n_rplen = rplen; np->n_nmlen = nmlen; np->n_name = smbfs_name_alloc(name, nmlen); np->n_ino = fap->fa_ino; if (dvp) { ASSERT_VOP_LOCKED(dvp, "smbfs_node_alloc"); np->n_parent = dvp; np->n_parentino = VTOSMB(dvp)->n_ino; if (/*vp->v_type == VDIR &&*/ (dvp->v_vflag & VV_ROOT) == 0) { vref(dvp); np->n_flag |= NREFPARENT; } } else if (vp->v_type == VREG) SMBERROR("new vnode '%s' born without parent ?\n", np->n_name); error = insmntque(vp, mp); if (error) { free(np, M_SMBNODE); return (error); } error = vfs_hash_insert(vp, smbfs_hash(name, nmlen), LK_EXCLUSIVE, td, &vp2, smbfs_vnode_cmp, &sc); if (error) return (error); if (vp2 != NULL) *vpp = vp2; return (0); } int smbfs_nget(struct mount *mp, struct vnode *dvp, const char *name, int nmlen, struct smbfattr *fap, struct vnode **vpp) { struct smbnode *dnp, *np; struct vnode *vp; int error, sep; dnp = (dvp) ? VTOSMB(dvp) : NULL; sep = 0; if (dnp != NULL) { sep = SMBFS_DNP_SEP(dnp); error = smbfs_node_alloc(mp, dvp, dnp->n_rpath, dnp->n_rplen, name, nmlen, sep, fap, &vp); } else error = smbfs_node_alloc(mp, NULL, "\\", 1, name, nmlen, sep, fap, &vp); if (error) return error; MPASS(vp != NULL); np = VTOSMB(vp); if (fap) smbfs_attr_cacheenter(vp, fap); *vpp = vp; return 0; } /* * Free smbnode, and give vnode back to system */ int smbfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp; struct smbnode *np = VTOSMB(vp); struct smbmount *smp = VTOSMBFS(vp); SMBVDEBUG("%s,%d\n", np->n_name, vrefcnt(vp)); KASSERT((np->n_flag & NOPEN) == 0, ("file not closed before reclaim")); - /* - * Destroy the vm object and flush associated pages. - */ - vnode_destroy_vobject(vp); dvp = (np->n_parent && (np->n_flag & NREFPARENT)) ? np->n_parent : NULL; /* * Remove the vnode from its hash chain. */ vfs_hash_remove(vp); if (np->n_name) smbfs_name_free(np->n_name); if (np->n_rpath) free(np->n_rpath, M_SMBNODENAME); free(np, M_SMBNODE); vp->v_data = NULL; if (dvp != NULL) { vrele(dvp); /* * Indicate that we released something; see comment * in smbfs_unmount(). 
*/ smp->sm_didrele = 1; } return 0; } int smbfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct thread *td = ap->a_td; struct ucred *cred = td->td_ucred; struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; struct vattr va; SMBVDEBUG("%s: %d\n", VTOSMB(vp)->n_name, vrefcnt(vp)); if ((np->n_flag & NOPEN) != 0) { scred = smbfs_malloc_scred(); smb_makescred(scred, td, cred); smbfs_vinvalbuf(vp, td); if (vp->v_type == VREG) { VOP_GETATTR(vp, &va, cred); smbfs_smb_close(np->n_mount->sm_share, np->n_fid, &np->n_mtime, scred); } else if (vp->v_type == VDIR) { if (np->n_dirseq != NULL) { smbfs_findclose(np->n_dirseq, scred); np->n_dirseq = NULL; } } np->n_flag &= ~NOPEN; smbfs_attr_cacheremove(vp); smbfs_free_scred(scred); } if (np->n_flag & NGONE) vrecycle(vp); return (0); } /* * routines to maintain vnode attributes cache * smbfs_attr_cacheenter: unpack np.i to vattr structure */ void smbfs_attr_cacheenter(struct vnode *vp, struct smbfattr *fap) { struct smbnode *np = VTOSMB(vp); if (vp->v_type == VREG) { if (np->n_size != fap->fa_size) { np->n_size = fap->fa_size; vnode_pager_setsize(vp, np->n_size); } } else if (vp->v_type == VDIR) { np->n_size = 16384; /* should be a better way ... */ } else return; np->n_mtime = fap->fa_mtime; np->n_dosattr = fap->fa_attr; np->n_attrage = time_second; return; } int smbfs_attr_cachelookup(struct vnode *vp, struct vattr *va) { struct smbnode *np = VTOSMB(vp); struct smbmount *smp = VTOSMBFS(vp); int diff; diff = time_second - np->n_attrage; if (diff > 2) /* XXX should be configurable */ return ENOENT; va->va_type = vp->v_type; /* vnode type (for create) */ va->va_flags = 0; /* flags defined for file */ if (vp->v_type == VREG) { va->va_mode = smp->sm_file_mode; /* files access mode and type */ if (np->n_dosattr & SMB_FA_RDONLY) { va->va_mode &= ~(S_IWUSR|S_IWGRP|S_IWOTH); va->va_flags |= UF_READONLY; } } else if (vp->v_type == VDIR) { va->va_mode = smp->sm_dir_mode; /* files access mode and type */ } else return EINVAL; va->va_size = np->n_size; va->va_nlink = 1; /* number of references to file */ va->va_uid = smp->sm_uid; /* owner user id */ va->va_gid = smp->sm_gid; /* owner group id */ va->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; va->va_fileid = np->n_ino; /* file id */ if (va->va_fileid == 0) va->va_fileid = 2; va->va_blocksize = SSTOVC(smp->sm_share)->vc_txmax; va->va_mtime = np->n_mtime; va->va_atime = va->va_ctime = va->va_mtime; /* time file changed */ va->va_gen = VNOVAL; /* generation number of file */ if (np->n_dosattr & SMB_FA_HIDDEN) va->va_flags |= UF_HIDDEN; if (np->n_dosattr & SMB_FA_SYSTEM) va->va_flags |= UF_SYSTEM; /* * We don't set the archive bit for directories. */ if ((vp->v_type != VDIR) && (np->n_dosattr & SMB_FA_ARCHIVE)) va->va_flags |= UF_ARCHIVE; va->va_rdev = NODEV; /* device the special file represents */ va->va_bytes = va->va_size; /* bytes of disk space held by file */ va->va_filerev = 0; /* file modification number */ va->va_vaflags = 0; /* operations flags */ return 0; } diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c index 1b3aa6d2db92..1c181bc3a59d 100644 --- a/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -1,1624 +1,1622 @@ /* $NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * tmpfs vnode interface. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_tmpfs); static volatile int tmpfs_rename_restarts; SYSCTL_INT(_vfs_tmpfs, OID_AUTO, rename_restarts, CTLFLAG_RD, __DEVOLATILE(int *, &tmpfs_rename_restarts), 0, "Times rename had to restart due to lock contention"); static int tmpfs_vn_get_ino_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (tmpfs_alloc_vp(mp, arg, lkflags, rvp)); } static int tmpfs_lookup1(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct tmpfs_dirent *de; struct tmpfs_node *dnode, *pnode; struct tmpfs_mount *tm; int error; dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULLVP; /* Check accessibility of requested node as a first step. */ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* We cannot be requesting the parent directory of the root node. */ MPASS(IMPLIES(dnode->tn_type == VDIR && dnode->tn_dir.tn_parent == dnode, !(cnp->cn_flags & ISDOTDOT))); TMPFS_ASSERT_LOCKED(dnode); if (dnode->tn_dir.tn_parent == NULL) { error = ENOENT; goto out; } if (cnp->cn_flags & ISDOTDOT) { tm = VFS_TO_TMPFS(dvp->v_mount); pnode = dnode->tn_dir.tn_parent; tmpfs_ref_node(pnode); error = vn_vget_ino_gen(dvp, tmpfs_vn_get_ino_alloc, pnode, cnp->cn_lkflags, vpp); tmpfs_free_node(tm, pnode); if (error != 0) goto out; } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { VREF(dvp); *vpp = dvp; error = 0; } else { de = tmpfs_dir_lookup(dnode, NULL, cnp); if (de != NULL && de->td_node == NULL) cnp->cn_flags |= ISWHITEOUT; if (de == NULL || de->td_node == NULL) { /* * The entry was not found in the directory. * This is OK if we are creating or renaming an * entry and are working on the last component of * the path name. 
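 * For example (illustrative, not from the original comment):
 * creat("/mnt/newfile") reaches this point with cn_nameiop ==
 * CREATE and ISLASTCN set for "newfile", so the lookup returns
 * EJUSTRETURN below to tell the caller it may go ahead and
 * create the entry.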
*/ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME || (cnp->cn_nameiop == DELETE && cnp->cn_flags & DOWHITEOUT && cnp->cn_flags & ISWHITEOUT))) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* * Keep the component name in the buffer for * future uses. */ cnp->cn_flags |= SAVENAME; error = EJUSTRETURN; } else error = ENOENT; } else { struct tmpfs_node *tnode; /* * The entry was found, so get its associated * tmpfs_node. */ tnode = de->td_node; /* * If we are not at the last path component and * found a non-directory or non-link entry (which * may itself be pointing to a directory), raise * an error. */ if ((tnode->tn_type != VDIR && tnode->tn_type != VLNK) && !(cnp->cn_flags & ISLASTCN)) { error = ENOTDIR; goto out; } /* * If we are deleting or renaming the entry, keep * track of its tmpfs_dirent so that it can be * easily deleted later. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* Allocate a new vnode on the matching entry. */ error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); if (error != 0) goto out; if ((dnode->tn_mode & S_ISTXT) && VOP_ACCESS(dvp, VADMIN, cnp->cn_cred, cnp->cn_thread) && VOP_ACCESS(*vpp, VADMIN, cnp->cn_cred, cnp->cn_thread)) { error = EPERM; vput(*vpp); *vpp = NULL; goto out; } cnp->cn_flags |= SAVENAME; } else { error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); if (error != 0) goto out; } } } /* * Store the result of this lookup in the cache. Avoid this if the * request was for creation, as it does not improve timings on * empirical tests. */ if ((cnp->cn_flags & MAKEENTRY) != 0 && tmpfs_use_nc(dvp)) cache_enter(dvp, *vpp, cnp); out: /* * If there were no errors, *vpp cannot be null and it must be * locked. */ MPASS(IFF(error == 0, *vpp != NULLVP && VOP_ISLOCKED(*vpp))); return (error); } static int tmpfs_cached_lookup(struct vop_cachedlookup_args *v) { return (tmpfs_lookup1(v->a_dvp, v->a_vpp, v->a_cnp)); } static int tmpfs_lookup(struct vop_lookup_args *v) { return (tmpfs_lookup1(v->a_dvp, v->a_vpp, v->a_cnp)); } static int tmpfs_create(struct vop_create_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; int error; MPASS(vap->va_type == VREG || vap->va_type == VSOCK); error = tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL); if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0 && tmpfs_use_nc(dvp)) cache_enter(dvp, *vpp, cnp); return (error); } static int tmpfs_mknod(struct vop_mknod_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; if (vap->va_type != VBLK && vap->va_type != VCHR && vap->va_type != VFIFO) return EINVAL; return tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL); } static int tmpfs_open(struct vop_open_args *v) { struct vnode *vp = v->a_vp; int mode = v->a_mode; int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp)); node = VP_TO_TMPFS_NODE(vp); /* The file is still active but all its names have been removed * (e.g. by a "rmdir $(pwd)"). It cannot be opened any more as * it is about to die. */ if (node->tn_links < 1) return (ENOENT); /* If the file is marked append-only, deny write requests.
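 * An illustrative consequence of the check below (assumption,
 * not original text): open(path, O_WRONLY) on an append-only
 * file fails with EPERM, while open(path, O_WRONLY | O_APPEND)
 * is allowed, since (mode & (FWRITE | O_APPEND)) == FWRITE only
 * matches writers that did not request append mode.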
*/ if (node->tn_flags & APPEND && (mode & (FWRITE | O_APPEND)) == FWRITE) error = EPERM; else { error = 0; /* For regular files, the call below is nop. */ KASSERT(vp->v_type != VREG || (node->tn_reg.tn_aobj->flags & OBJ_DEAD) == 0, ("dead object")); vnode_create_vobject(vp, node->tn_size, v->a_td); } MPASS(VOP_ISLOCKED(vp)); return error; } static int tmpfs_close(struct vop_close_args *v) { struct vnode *vp = v->a_vp; /* Update node times. */ tmpfs_update(vp); return (0); } int tmpfs_access(struct vop_access_args *v) { struct vnode *vp = v->a_vp; accmode_t accmode = v->a_accmode; struct ucred *cred = v->a_cred; int error; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(vp)); node = VP_TO_TMPFS_NODE(vp); switch (vp->v_type) { case VDIR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VREG: if (accmode & VWRITE && vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } break; case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VSOCK: /* FALLTHROUGH */ case VFIFO: break; default: error = EINVAL; goto out; } if (accmode & VWRITE && node->tn_flags & IMMUTABLE) { error = EPERM; goto out; } error = vaccess(vp->v_type, node->tn_mode, node->tn_uid, node->tn_gid, accmode, cred, NULL); out: MPASS(VOP_ISLOCKED(vp)); return error; } int tmpfs_getattr(struct vop_getattr_args *v) { struct vnode *vp = v->a_vp; struct vattr *vap = v->a_vap; vm_object_t obj; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); tmpfs_update(vp); vap->va_type = vp->v_type; vap->va_mode = node->tn_mode; vap->va_nlink = node->tn_links; vap->va_uid = node->tn_uid; vap->va_gid = node->tn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = node->tn_id; vap->va_size = node->tn_size; vap->va_blocksize = PAGE_SIZE; vap->va_atime = node->tn_atime; vap->va_mtime = node->tn_mtime; vap->va_ctime = node->tn_ctime; vap->va_birthtime = node->tn_birthtime; vap->va_gen = node->tn_gen; vap->va_flags = node->tn_flags; vap->va_rdev = (vp->v_type == VBLK || vp->v_type == VCHR) ? node->tn_rdev : NODEV; if (vp->v_type == VREG) { obj = node->tn_reg.tn_aobj; vap->va_bytes = (u_quad_t)obj->resident_page_count * PAGE_SIZE; } else vap->va_bytes = node->tn_size; vap->va_filerev = 0; return 0; } int tmpfs_setattr(struct vop_setattr_args *v) { struct vnode *vp = v->a_vp; struct vattr *vap = v->a_vap; struct ucred *cred = v->a_cred; struct thread *td = curthread; int error; MPASS(VOP_ISLOCKED(vp)); error = 0; /* Abort if any unsettable attribute is given. */ if (vap->va_type != VNON || vap->va_nlink != VNOVAL || vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || vap->va_blocksize != VNOVAL || vap->va_gen != VNOVAL || vap->va_rdev != VNOVAL || vap->va_bytes != VNOVAL) error = EINVAL; if (error == 0 && (vap->va_flags != VNOVAL)) error = tmpfs_chflags(vp, vap->va_flags, cred, td); if (error == 0 && (vap->va_size != VNOVAL)) error = tmpfs_chsize(vp, vap->va_size, cred, td); if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL)) error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred, td); if (error == 0 && (vap->va_mode != (mode_t)VNOVAL)) error = tmpfs_chmod(vp, vap->va_mode, cred, td); if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL && vap->va_atime.tv_nsec != VNOVAL) || (vap->va_mtime.tv_sec != VNOVAL && vap->va_mtime.tv_nsec != VNOVAL) || (vap->va_birthtime.tv_sec != VNOVAL && vap->va_birthtime.tv_nsec != VNOVAL))) error = tmpfs_chtimes(vp, vap, cred, td); /* Update the node times. 
We give preference to the error codes * generated by this function rather than the ones that may arise * from tmpfs_update. */ tmpfs_update(vp); MPASS(VOP_ISLOCKED(vp)); return error; } static int tmpfs_read(struct vop_read_args *v) { struct vnode *vp; struct uio *uio; struct tmpfs_node *node; vp = v->a_vp; if (vp->v_type != VREG) return (EISDIR); uio = v->a_uio; if (uio->uio_offset < 0) return (EINVAL); node = VP_TO_TMPFS_NODE(vp); tmpfs_set_status(VFS_TO_TMPFS(vp->v_mount), node, TMPFS_NODE_ACCESSED); return (uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio)); } static int tmpfs_write(struct vop_write_args *v) { struct vnode *vp; struct uio *uio; struct tmpfs_node *node; off_t oldsize; int error, ioflag; vp = v->a_vp; uio = v->a_uio; ioflag = v->a_ioflag; error = 0; node = VP_TO_TMPFS_NODE(vp); oldsize = node->tn_size; if (uio->uio_offset < 0 || vp->v_type != VREG) return (EINVAL); if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio->uio_offset = node->tn_size; if (uio->uio_offset + uio->uio_resid > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) return (EFBIG); if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); if (uio->uio_offset + uio->uio_resid > node->tn_size) { error = tmpfs_reg_resize(vp, uio->uio_offset + uio->uio_resid, FALSE); if (error != 0) goto out; } error = uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio); node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED; if (node->tn_mode & (S_ISUID | S_ISGID)) { if (priv_check_cred(v->a_cred, PRIV_VFS_RETAINSUGID)) node->tn_mode &= ~(S_ISUID | S_ISGID); } if (error != 0) (void)tmpfs_reg_resize(vp, oldsize, TRUE); out: MPASS(IMPLIES(error == 0, uio->uio_resid == 0)); MPASS(IMPLIES(error != 0, oldsize == node->tn_size)); return (error); } static int tmpfs_fsync(struct vop_fsync_args *v) { struct vnode *vp = v->a_vp; MPASS(VOP_ISLOCKED(vp)); tmpfs_check_mtime(vp); tmpfs_update(vp); return 0; } static int tmpfs_remove(struct vop_remove_args *v) { struct vnode *dvp = v->a_dvp; struct vnode *vp = v->a_vp; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp)); MPASS(VOP_ISLOCKED(vp)); if (vp->v_type == VDIR) { error = EISDIR; goto out; } dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); de = tmpfs_dir_lookup(dnode, node, v->a_cnp); MPASS(de != NULL); /* Files marked as immutable or append-only cannot be deleted. */ if ((node->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) || (dnode->tn_flags & APPEND)) { error = EPERM; goto out; } /* Remove the entry from the directory; as it is a file, we do not * have to change the number of hard links of the directory. */ tmpfs_dir_detach(dvp, de); if (v->a_cnp->cn_flags & DOWHITEOUT) tmpfs_dir_whiteout_add(dvp, v->a_cnp); /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ tmpfs_free_dirent(tmp, de); node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED; error = 0; out: return error; } static int tmpfs_link(struct vop_link_args *v) { struct vnode *dvp = v->a_tdvp; struct vnode *vp = v->a_vp; struct componentname *cnp = v->a_cnp; int error; struct tmpfs_dirent *de; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp)); MPASS(cnp->cn_flags & HASBUF); MPASS(dvp != vp); /* XXX When can this be false? */ node = VP_TO_TMPFS_NODE(vp); /* Ensure that we do not overflow the maximum number of links imposed * by the system. 
*/ MPASS(node->tn_links <= TMPFS_LINK_MAX); if (node->tn_links == TMPFS_LINK_MAX) { error = EMLINK; goto out; } /* We cannot create links of files marked immutable or append-only. */ if (node->tn_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } /* Allocate a new directory entry to represent the node. */ error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) goto out; /* Insert the new directory entry into the appropriate directory. */ if (cnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(dvp, cnp); tmpfs_dir_attach(dvp, de); /* vp link count has changed, so update node times. */ node->tn_status |= TMPFS_NODE_CHANGED; tmpfs_update(vp); error = 0; out: return error; } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int tmpfs_rename_relock(struct vnode *fdvp, struct vnode **fvpp, struct vnode *tdvp, struct vnode **tvpp, struct componentname *fcnp, struct componentname *tcnp) { struct vnode *nvp; struct mount *mp; struct tmpfs_dirent *de; int error, restarts = 0; VOP_UNLOCK(tdvp, 0); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp, 0); mp = fdvp->v_mount; relock: restarts += 1; error = vn_lock(fdvp, LK_EXCLUSIVE); if (error) goto releout; if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(fdvp, 0); error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto releout; VOP_UNLOCK(tdvp, 0); goto relock; } /* * Re-resolve fvp to be certain it still exists and fetch the * correct vnode. */ de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(fdvp), NULL, fcnp); if (de == NULL) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); if ((fcnp->cn_flags & ISDOTDOT) != 0 || (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.')) error = EINVAL; else error = ENOENT; goto releout; } error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (error != 0) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); if (error != EBUSY) goto releout; error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp, 0); /* * Concurrent rename race. */ if (nvp == tdvp) { vrele(nvp); error = EINVAL; goto releout; } vrele(*fvpp); *fvpp = nvp; goto relock; } vrele(*fvpp); *fvpp = nvp; VOP_UNLOCK(*fvpp, 0); /* * Re-resolve tvp and acquire the vnode lock if present. */ de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(tdvp), NULL, tcnp); /* * If tvp disappeared we just carry on. */ if (de == NULL && *tvpp != NULL) { vrele(*tvpp); *tvpp = NULL; } /* * Get the tvp ino if the lookup succeeded. We may have to restart * if the non-blocking acquire fails. */ if (de != NULL) { nvp = NULL; error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (*tvpp != NULL) vrele(*tvpp); *tvpp = nvp; if (error != 0) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); if (error != EBUSY) goto releout; error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp, 0); /* * fdvp contains fvp, thus tvp (=fdvp) is not empty. 
*/ if (nvp == fdvp) { error = ENOTEMPTY; goto releout; } goto relock; } } tmpfs_rename_restarts += restarts; return (0); releout: vrele(fdvp); vrele(*fvpp); vrele(tdvp); if (*tvpp != NULL) vrele(*tvpp); tmpfs_rename_restarts += restarts; return (error); } static int tmpfs_rename(struct vop_rename_args *v) { struct vnode *fdvp = v->a_fdvp; struct vnode *fvp = v->a_fvp; struct componentname *fcnp = v->a_fcnp; struct vnode *tdvp = v->a_tdvp; struct vnode *tvp = v->a_tvp; struct componentname *tcnp = v->a_tcnp; struct mount *mp = NULL; char *newname; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *fdnode; struct tmpfs_node *fnode; struct tmpfs_node *tnode; struct tmpfs_node *tdnode; MPASS(VOP_ISLOCKED(tdvp)); MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp))); MPASS(fcnp->cn_flags & HASBUF); MPASS(tcnp->cn_flags & HASBUF); /* Disallow cross-device renames. * XXX Why isn't this done by the caller? */ if (fvp->v_mount != tdvp->v_mount || (tvp != NULL && fvp->v_mount != tvp->v_mount)) { error = EXDEV; goto out; } /* If source and target are the same file, there is nothing to do. */ if (fvp == tvp) { error = 0; goto out; } /* If we need to move the directory between entries, lock the * source so that we can safely operate on it. */ if (fdvp != tdvp && fdvp != tvp) { if (vn_lock(fdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { mp = tdvp->v_mount; error = vfs_busy(mp, 0); if (error != 0) { mp = NULL; goto out; } error = tmpfs_rename_relock(fdvp, &fvp, tdvp, &tvp, fcnp, tcnp); if (error != 0) { vfs_unbusy(mp); return (error); } ASSERT_VOP_ELOCKED(fdvp, "tmpfs_rename: fdvp not locked"); ASSERT_VOP_ELOCKED(tdvp, "tmpfs_rename: tdvp not locked"); if (tvp != NULL) ASSERT_VOP_ELOCKED(tvp, "tmpfs_rename: tvp not locked"); if (fvp == tvp) { error = 0; goto out_locked; } } } tmp = VFS_TO_TMPFS(tdvp->v_mount); tdnode = VP_TO_TMPFS_DIR(tdvp); tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp); fdnode = VP_TO_TMPFS_DIR(fdvp); fnode = VP_TO_TMPFS_NODE(fvp); de = tmpfs_dir_lookup(fdnode, fnode, fcnp); /* Entry can disappear before we lock fdvp, * also avoid manipulating '.' and '..' entries. */ if (de == NULL) { if ((fcnp->cn_flags & ISDOTDOT) != 0 || (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.')) error = EINVAL; else error = ENOENT; goto out_locked; } MPASS(de->td_node == fnode); /* If re-naming a directory to another preexisting directory * ensure that the target directory is empty so that its * removal causes no side effects. * Kern_rename guarantees the destination to be a directory * if the source is one. */ if (tvp != NULL) { MPASS(tnode != NULL); if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (tdnode->tn_flags & (APPEND | IMMUTABLE))) { error = EPERM; goto out_locked; } if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) { if (tnode->tn_size > 0) { error = ENOTEMPTY; goto out_locked; } } else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) { error = ENOTDIR; goto out_locked; } else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) { error = EISDIR; goto out_locked; } else { MPASS(fnode->tn_type != VDIR && tnode->tn_type != VDIR); } } if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (fdnode->tn_flags & (APPEND | IMMUTABLE))) { error = EPERM; goto out_locked; } /* Ensure that we have enough memory to hold the new name, if it * has to be changed. 
*/ if (fcnp->cn_namelen != tcnp->cn_namelen || bcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) != 0) { newname = malloc(tcnp->cn_namelen, M_TMPFSNAME, M_WAITOK); } else newname = NULL; /* If the node is being moved to another directory, we have to do * the move. */ if (fdnode != tdnode) { /* In case we are moving a directory, we have to adjust its * parent to point to the new parent. */ if (de->td_node->tn_type == VDIR) { struct tmpfs_node *n; /* Ensure the target directory is not a child of the * directory being moved. Otherwise, we'd end up * with stale nodes. */ n = tdnode; /* TMPFS_LOCK guarantees that no nodes are freed while * traversing the list. Nodes can only be marked as * removed: tn_parent == NULL. */ TMPFS_LOCK(tmp); TMPFS_NODE_LOCK(n); while (n != n->tn_dir.tn_parent) { struct tmpfs_node *parent; if (n == fnode) { TMPFS_NODE_UNLOCK(n); TMPFS_UNLOCK(tmp); error = EINVAL; if (newname != NULL) free(newname, M_TMPFSNAME); goto out_locked; } parent = n->tn_dir.tn_parent; TMPFS_NODE_UNLOCK(n); if (parent == NULL) { n = NULL; break; } TMPFS_NODE_LOCK(parent); if (parent->tn_dir.tn_parent == NULL) { TMPFS_NODE_UNLOCK(parent); n = NULL; break; } n = parent; } TMPFS_UNLOCK(tmp); if (n == NULL) { error = EINVAL; if (newname != NULL) free(newname, M_TMPFSNAME); goto out_locked; } TMPFS_NODE_UNLOCK(n); /* Adjust the parent pointer. */ TMPFS_VALIDATE_DIR(fnode); TMPFS_NODE_LOCK(de->td_node); de->td_node->tn_dir.tn_parent = tdnode; TMPFS_NODE_UNLOCK(de->td_node); /* As a result of changing the target of the '..' * entry, the link count of the source and target * directories has to be adjusted. */ TMPFS_NODE_LOCK(tdnode); TMPFS_ASSERT_LOCKED(tdnode); tdnode->tn_links++; TMPFS_NODE_UNLOCK(tdnode); TMPFS_NODE_LOCK(fdnode); TMPFS_ASSERT_LOCKED(fdnode); fdnode->tn_links--; TMPFS_NODE_UNLOCK(fdnode); } } /* Do the move: just remove the entry from the source directory * and insert it into the target one. */ tmpfs_dir_detach(fdvp, de); if (fcnp->cn_flags & DOWHITEOUT) tmpfs_dir_whiteout_add(fdvp, fcnp); if (tcnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(tdvp, tcnp); /* If the name has changed, we need to make it effective by changing * it in the directory entry. */ if (newname != NULL) { MPASS(tcnp->cn_namelen <= MAXNAMLEN); free(de->ud.td_name, M_TMPFSNAME); de->ud.td_name = newname; tmpfs_dirent_init(de, tcnp->cn_nameptr, tcnp->cn_namelen); fnode->tn_status |= TMPFS_NODE_CHANGED; tdnode->tn_status |= TMPFS_NODE_MODIFIED; } /* If we are overwriting an entry, we have to remove the old one * from the target directory. */ if (tvp != NULL) { struct tmpfs_dirent *tde; /* Remove the old entry from the target directory. */ tde = tmpfs_dir_lookup(tdnode, tnode, tcnp); tmpfs_dir_detach(tdvp, tde); /* Free the directory entry we just deleted. Note that the * node referred by it will not be removed until the vnode is * really reclaimed. */ tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), tde); } tmpfs_dir_attach(tdvp, de); if (tmpfs_use_nc(fvp)) { cache_purge(fvp); if (tvp != NULL) cache_purge(tvp); cache_purge_negative(tdvp); } error = 0; out_locked: if (fdvp != tdvp && fdvp != tvp) VOP_UNLOCK(fdvp, 0); out: /* Release target nodes. */ /* XXX: I don't understand when tdvp can be the same as tvp, but * other code takes care of this... */ if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp != NULL) vput(tvp); /* Release source nodes.
*/ vrele(fdvp); vrele(fvp); if (mp != NULL) vfs_unbusy(mp); return error; } static int tmpfs_mkdir(struct vop_mkdir_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; MPASS(vap->va_type == VDIR); return tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL); } static int tmpfs_rmdir(struct vop_rmdir_args *v) { struct vnode *dvp = v->a_dvp; struct vnode *vp = v->a_vp; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; MPASS(VOP_ISLOCKED(dvp)); MPASS(VOP_ISLOCKED(vp)); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); node = VP_TO_TMPFS_DIR(vp); /* Directories with more than two entries ('.' and '..') cannot be * removed. */ if (node->tn_size > 0) { error = ENOTEMPTY; goto out; } if ((dnode->tn_flags & APPEND) || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* This invariant holds only if we are not trying to remove "..". * We checked for that above so this is safe now. */ MPASS(node->tn_dir.tn_parent == dnode); /* Get the directory entry associated with node (vp). This was * filled by tmpfs_lookup while looking up the entry. */ de = tmpfs_dir_lookup(dnode, node, v->a_cnp); MPASS(TMPFS_DIRENT_MATCHES(de, v->a_cnp->cn_nameptr, v->a_cnp->cn_namelen)); /* Check flags to see if we are allowed to remove the directory. */ if ((dnode->tn_flags & APPEND) != 0 || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) != 0) { error = EPERM; goto out; } /* Detach the directory entry from the directory (dnode). */ tmpfs_dir_detach(dvp, de); if (v->a_cnp->cn_flags & DOWHITEOUT) tmpfs_dir_whiteout_add(dvp, v->a_cnp); /* No vnode should be allocated for this entry from this point */ TMPFS_NODE_LOCK(node); node->tn_links--; node->tn_dir.tn_parent = NULL; node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; TMPFS_NODE_UNLOCK(node); TMPFS_NODE_LOCK(dnode); dnode->tn_links--; dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; TMPFS_NODE_UNLOCK(dnode); if (tmpfs_use_nc(dvp)) { cache_purge(dvp); cache_purge(vp); } /* Free the directory entry we just deleted. Note that the node * referred by it will not be removed until the vnode is really * reclaimed. */ tmpfs_free_dirent(tmp, de); /* Release the deleted vnode (will destroy the node, notify * interested parties and clean it from the cache). */ dnode->tn_status |= TMPFS_NODE_CHANGED; tmpfs_update(dvp); error = 0; out: return error; } static int tmpfs_symlink(struct vop_symlink_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; struct vattr *vap = v->a_vap; const char *target = v->a_target; #ifdef notyet /* XXX FreeBSD BUG: kern_symlink is not setting VLNK */ MPASS(vap->va_type == VLNK); #else vap->va_type = VLNK; #endif return tmpfs_alloc_file(dvp, vpp, vap, cnp, target); } static int tmpfs_readdir(struct vop_readdir_args *va) { struct vnode *vp; struct uio *uio; struct tmpfs_mount *tm; struct tmpfs_node *node; u_long **cookies; int *eofflag, *ncookies; ssize_t startresid; int error, maxcookies; vp = va->a_vp; uio = va->a_uio; eofflag = va->a_eofflag; cookies = va->a_cookies; ncookies = va->a_ncookies; /* This operation only makes sense on directory nodes. */ if (vp->v_type != VDIR) return ENOTDIR; maxcookies = 0; node = VP_TO_TMPFS_DIR(vp); tm = VFS_TO_TMPFS(vp->v_mount); startresid = uio->uio_resid; /* Allocate cookies for NFS and compat modules. 
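 * Illustrative note (assumption, not original text): maxcookies
 * below is howmany(tn_size, sizeof(struct tmpfs_dirent)) + 2,
 * where the two extra slots cover the synthesized "." and ".."
 * entries, so a full directory scan cannot overrun the array.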
*/ if (cookies != NULL && ncookies != NULL) { maxcookies = howmany(node->tn_size, sizeof(struct tmpfs_dirent)) + 2; *cookies = malloc(maxcookies * sizeof(**cookies), M_TEMP, M_WAITOK); *ncookies = 0; } if (cookies == NULL) error = tmpfs_dir_getdents(tm, node, uio, 0, NULL, NULL); else error = tmpfs_dir_getdents(tm, node, uio, maxcookies, *cookies, ncookies); /* Buffer was filled without hitting EOF. */ if (error == EJUSTRETURN) error = (uio->uio_resid != startresid) ? 0 : EINVAL; if (error != 0 && cookies != NULL && ncookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } if (eofflag != NULL) *eofflag = (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF); return error; } static int tmpfs_readlink(struct vop_readlink_args *v) { struct vnode *vp = v->a_vp; struct uio *uio = v->a_uio; int error; struct tmpfs_node *node; MPASS(uio->uio_offset == 0); MPASS(vp->v_type == VLNK); node = VP_TO_TMPFS_NODE(vp); error = uiomove(node->tn_link, MIN(node->tn_size, uio->uio_resid), uio); tmpfs_set_status(VFS_TO_TMPFS(vp->v_mount), node, TMPFS_NODE_ACCESSED); return (error); } static int tmpfs_inactive(struct vop_inactive_args *v) { struct vnode *vp; struct tmpfs_node *node; vp = v->a_vp; node = VP_TO_TMPFS_NODE(vp); if (node->tn_links == 0) vrecycle(vp); else tmpfs_check_mtime(vp); return (0); } static int tmpfs_need_inactive(struct vop_need_inactive_args *ap) { struct vnode *vp; struct tmpfs_node *node; struct vm_object *obj; vp = ap->a_vp; node = VP_TO_TMPFS_NODE(vp); if (node->tn_links == 0) goto need; if (vp->v_type == VREG) { obj = vp->v_object; if ((obj->flags & OBJ_TMPFS_DIRTY) != 0) goto need; } return (0); need: return (1); } int tmpfs_reclaim(struct vop_reclaim_args *v) { struct vnode *vp = v->a_vp; struct tmpfs_mount *tmp; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); if (vp->v_type == VREG) tmpfs_destroy_vobject(vp, node->tn_reg.tn_aobj); - else - vnode_destroy_vobject(vp); vp->v_object = NULL; if (tmpfs_use_nc(vp)) cache_purge(vp); TMPFS_NODE_LOCK(node); tmpfs_free_vp(vp); /* If the node referenced by this vnode was deleted by the user, * we must free its associated data structures (now that the vnode * is being reclaimed). 
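 * Illustrative note (assumption, not original text): a file
 * unlinked while still open reaches this point with
 * tn_links == 0 only after the last descriptor is closed, and
 * only then is the node itself freed.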
*/ if (node->tn_links == 0 && (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0) { node->tn_vpstate = TMPFS_VNODE_DOOMED; TMPFS_NODE_UNLOCK(node); tmpfs_free_node(tmp, node); } else TMPFS_NODE_UNLOCK(node); MPASS(vp->v_data == NULL); return 0; } int tmpfs_print(struct vop_print_args *v) { struct vnode *vp = v->a_vp; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); printf("tag VT_TMPFS, tmpfs_node %p, flags 0x%lx, links %jd\n", node, node->tn_flags, (uintmax_t)node->tn_links); printf("\tmode 0%o, owner %d, group %d, size %jd, status 0x%x\n", node->tn_mode, node->tn_uid, node->tn_gid, (intmax_t)node->tn_size, node->tn_status); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return 0; } int tmpfs_pathconf(struct vop_pathconf_args *v) { struct vnode *vp = v->a_vp; int name = v->a_name; long *retval = v->a_retval; int error; error = 0; switch (name) { case _PC_LINK_MAX: *retval = TMPFS_LINK_MAX; break; case _PC_NAME_MAX: *retval = NAME_MAX; break; case _PC_PIPE_BUF: if (vp->v_type == VDIR || vp->v_type == VFIFO) *retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *retval = 1; break; case _PC_NO_TRUNC: *retval = 1; break; case _PC_SYNC_IO: *retval = 1; break; case _PC_FILESIZEBITS: *retval = 64; break; default: error = vop_stdpathconf(v); } return error; } static int tmpfs_vptofh(struct vop_vptofh_args *ap) { struct tmpfs_fid *tfhp; struct tmpfs_node *node; tfhp = (struct tmpfs_fid *)ap->a_fhp; node = VP_TO_TMPFS_NODE(ap->a_vp); tfhp->tf_len = sizeof(struct tmpfs_fid); tfhp->tf_id = node->tn_id; tfhp->tf_gen = node->tn_gen; return (0); } static int tmpfs_whiteout(struct vop_whiteout_args *ap) { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct tmpfs_dirent *de; switch (ap->a_flags) { case LOOKUP: return (0); case CREATE: de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); if (de != NULL) return (de->td_node == NULL ? 
0 : EEXIST); return (tmpfs_dir_whiteout_add(dvp, cnp)); case DELETE: tmpfs_dir_whiteout_remove(dvp, cnp); return (0); default: panic("tmpfs_whiteout: unknown op"); } } static int tmpfs_vptocnp_dir(struct tmpfs_node *tn, struct tmpfs_node *tnp, struct tmpfs_dirent **pde) { struct tmpfs_dir_cursor dc; struct tmpfs_dirent *de; for (de = tmpfs_dir_first(tnp, &dc); de != NULL; de = tmpfs_dir_next(tnp, &dc)) { if (de->td_node == tn) { *pde = de; return (0); } } return (ENOENT); } static int tmpfs_vptocnp_fill(struct vnode *vp, struct tmpfs_node *tn, struct tmpfs_node *tnp, char *buf, int *buflen, struct vnode **dvp) { struct tmpfs_dirent *de; int error, i; error = vn_vget_ino_gen(vp, tmpfs_vn_get_ino_alloc, tnp, LK_SHARED, dvp); if (error != 0) return (error); error = tmpfs_vptocnp_dir(tn, tnp, &de); if (error == 0) { i = *buflen; i -= de->td_namelen; if (i < 0) { error = ENOMEM; } else { bcopy(de->ud.td_name, buf + i, de->td_namelen); *buflen = i; } } if (error == 0) { if (vp != *dvp) VOP_UNLOCK(*dvp, 0); } else { if (vp != *dvp) vput(*dvp); else vrele(vp); } return (error); } static int tmpfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp, **dvp; struct tmpfs_node *tn, *tnp, *tnp1; struct tmpfs_dirent *de; struct tmpfs_mount *tm; char *buf; int *buflen; int error; vp = ap->a_vp; dvp = ap->a_vpp; buf = ap->a_buf; buflen = ap->a_buflen; tm = VFS_TO_TMPFS(vp->v_mount); tn = VP_TO_TMPFS_NODE(vp); if (tn->tn_type == VDIR) { tnp = tn->tn_dir.tn_parent; if (tnp == NULL) return (ENOENT); tmpfs_ref_node(tnp); error = tmpfs_vptocnp_fill(vp, tn, tn->tn_dir.tn_parent, buf, buflen, dvp); tmpfs_free_node(tm, tnp); return (error); } restart: TMPFS_LOCK(tm); LIST_FOREACH_SAFE(tnp, &tm->tm_nodes_used, tn_entries, tnp1) { if (tnp->tn_type != VDIR) continue; TMPFS_NODE_LOCK(tnp); tmpfs_ref_node_locked(tnp); /* * tn_vnode cannot be instantiated while we hold the * node lock, so the directory cannot be changed while * we iterate over it. Do this to avoid instantiating * vnode for directories which cannot point to our * node. */ error = tnp->tn_vnode == NULL ? tmpfs_vptocnp_dir(tn, tnp, &de) : 0; if (error == 0) { TMPFS_NODE_UNLOCK(tnp); TMPFS_UNLOCK(tm); error = tmpfs_vptocnp_fill(vp, tn, tnp, buf, buflen, dvp); if (error == 0) { tmpfs_free_node(tm, tnp); return (0); } if ((vp->v_iflag & VI_DOOMED) != 0) { tmpfs_free_node(tm, tnp); return (ENOENT); } TMPFS_LOCK(tm); TMPFS_NODE_LOCK(tnp); } if (tmpfs_free_node_locked(tm, tnp, false)) { goto restart; } else { KASSERT(tnp->tn_refcount > 0, ("node %p refcount zero", tnp)); tnp1 = LIST_NEXT(tnp, tn_entries); TMPFS_NODE_UNLOCK(tnp); } } TMPFS_UNLOCK(tm); return (ENOENT); } /* * Vnode operations vector used for files stored in a tmpfs file system. 
*/ struct vop_vector tmpfs_vnodeop_entries = { .vop_default = &default_vnodeops, .vop_lookup = vfs_cache_lookup, .vop_cachedlookup = tmpfs_cached_lookup, .vop_create = tmpfs_create, .vop_mknod = tmpfs_mknod, .vop_open = tmpfs_open, .vop_close = tmpfs_close, .vop_access = tmpfs_access, .vop_getattr = tmpfs_getattr, .vop_setattr = tmpfs_setattr, .vop_read = tmpfs_read, .vop_write = tmpfs_write, .vop_fsync = tmpfs_fsync, .vop_remove = tmpfs_remove, .vop_link = tmpfs_link, .vop_rename = tmpfs_rename, .vop_mkdir = tmpfs_mkdir, .vop_rmdir = tmpfs_rmdir, .vop_symlink = tmpfs_symlink, .vop_readdir = tmpfs_readdir, .vop_readlink = tmpfs_readlink, .vop_inactive = tmpfs_inactive, .vop_need_inactive = tmpfs_need_inactive, .vop_reclaim = tmpfs_reclaim, .vop_print = tmpfs_print, .vop_pathconf = tmpfs_pathconf, .vop_vptofh = tmpfs_vptofh, .vop_whiteout = tmpfs_whiteout, .vop_bmap = VOP_EOPNOTSUPP, .vop_vptocnp = tmpfs_vptocnp, }; /* * Same vector for mounts which do not use namecache. */ struct vop_vector tmpfs_vnodeop_nonc_entries = { .vop_default = &tmpfs_vnodeop_entries, .vop_lookup = tmpfs_lookup, }; diff --git a/sys/fs/udf/udf_vnops.c b/sys/fs/udf/udf_vnops.c index b1b004f95166..3f0666054f95 100644 --- a/sys/fs/udf/udf_vnops.c +++ b/sys/fs/udf/udf_vnops.c @@ -1,1496 +1,1491 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001, 2002 Scott Long * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ /* udf_vnops.c */ /* Take care of the vnode side of things */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct iconv_functions *udf_iconv; static vop_access_t udf_access; static vop_getattr_t udf_getattr; static vop_open_t udf_open; static vop_ioctl_t udf_ioctl; static vop_pathconf_t udf_pathconf; static vop_print_t udf_print; static vop_read_t udf_read; static vop_readdir_t udf_readdir; static vop_readlink_t udf_readlink; static vop_setattr_t udf_setattr; static vop_strategy_t udf_strategy; static vop_bmap_t udf_bmap; static vop_cachedlookup_t udf_lookup; static vop_reclaim_t udf_reclaim; static vop_vptofh_t udf_vptofh; static int udf_readatoffset(struct udf_node *node, int *size, off_t offset, struct buf **bp, uint8_t **data); static int udf_bmap_internal(struct udf_node *node, off_t offset, daddr_t *sector, uint32_t *max_size); static struct vop_vector udf_vnodeops = { .vop_default = &default_vnodeops, .vop_access = udf_access, .vop_bmap = udf_bmap, .vop_cachedlookup = udf_lookup, .vop_getattr = udf_getattr, .vop_ioctl = udf_ioctl, .vop_lookup = vfs_cache_lookup, .vop_open = udf_open, .vop_pathconf = udf_pathconf, .vop_print = udf_print, .vop_read = udf_read, .vop_readdir = udf_readdir, .vop_readlink = udf_readlink, .vop_reclaim = udf_reclaim, .vop_setattr = udf_setattr, .vop_strategy = udf_strategy, .vop_vptofh = udf_vptofh, }; struct vop_vector udf_fifoops = { .vop_default = &fifo_specops, .vop_access = udf_access, .vop_getattr = udf_getattr, .vop_pathconf = udf_pathconf, .vop_print = udf_print, .vop_reclaim = udf_reclaim, .vop_setattr = udf_setattr, .vop_vptofh = udf_vptofh, }; static MALLOC_DEFINE(M_UDFFID, "udf_fid", "UDF FileId structure"); static MALLOC_DEFINE(M_UDFDS, "udf_ds", "UDF Dirstream structure"); #define UDF_INVALID_BMAP -1 int udf_allocv(struct mount *mp, struct vnode **vpp, struct thread *td) { int error; struct vnode *vp; error = getnewvnode("udf", mp, &udf_vnodeops, &vp); if (error) { printf("udf_allocv: failed to allocate new vnode\n"); return (error); } *vpp = vp; return (0); } /* Convert file entry permission (5 bits per owner/group/user) to a mode_t */ static mode_t udf_permtomode(struct udf_node *node) { uint32_t perm; uint16_t flags; mode_t mode; perm = le32toh(node->fentry->perm); flags = le16toh(node->fentry->icbtag.flags); mode = perm & UDF_FENTRY_PERM_USER_MASK; mode |= ((perm & UDF_FENTRY_PERM_GRP_MASK) >> 2); mode |= ((perm & UDF_FENTRY_PERM_OWNER_MASK) >> 4); mode |= ((flags & UDF_ICB_TAG_FLAGS_STICKY) << 4); mode |= ((flags & UDF_ICB_TAG_FLAGS_SETGID) << 6); mode |= ((flags & UDF_ICB_TAG_FLAGS_SETUID) << 8); return (mode); } static int udf_access(struct vop_access_args *a) { struct vnode *vp; struct udf_node *node; accmode_t accmode; mode_t mode; vp = a->a_vp; node = VTON(vp); accmode = a->a_accmode; if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: return (EROFS); /* NOT REACHED */ default: break; } } mode = udf_permtomode(node); return (vaccess(vp->v_type, mode, node->fentry->uid, node->fentry->gid, accmode, a->a_cred, NULL)); } static int udf_open(struct vop_open_args *ap) { struct udf_node *np = VTON(ap->a_vp); off_t fsize; fsize = le64toh(np->fentry->inf_len); vnode_create_vobject(ap->a_vp, fsize, ap->a_td); return 0; } static const int mon_lens[2][12] = { {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}, {0, 31, 60, 91, 121, 152, 182, 213, 
244, 274, 305, 335} }; static int udf_isaleapyear(int year) { int i; i = (year % 4) ? 0 : 1; i &= (year % 100) ? 1 : 0; i |= (year % 400) ? 0 : 1; return i; } /* * Timezone calculation compliments of Julian Elischer . */ static void udf_timetotimespec(struct timestamp *time, struct timespec *t) { int i, lpyear, daysinyear, year, startyear; union { uint16_t u_tz_offset; int16_t s_tz_offset; } tz; /* * DirectCD seems to like using bogus year values. * Don't trust time->month as it will be used for an array index. */ year = le16toh(time->year); if (year < 1970 || time->month < 1 || time->month > 12) { t->tv_sec = 0; t->tv_nsec = 0; return; } /* Calculate the time and day */ t->tv_sec = time->second; t->tv_sec += time->minute * 60; t->tv_sec += time->hour * 3600; t->tv_sec += (time->day - 1) * 3600 * 24; /* Calculate the month */ lpyear = udf_isaleapyear(year); t->tv_sec += mon_lens[lpyear][time->month - 1] * 3600 * 24; /* Speed up the calculation */ startyear = 1970; if (year > 2009) { t->tv_sec += 1262304000; startyear += 40; } else if (year > 1999) { t->tv_sec += 946684800; startyear += 30; } else if (year > 1989) { t->tv_sec += 631152000; startyear += 20; } else if (year > 1979) { t->tv_sec += 315532800; startyear += 10; } daysinyear = (year - startyear) * 365; for (i = startyear; i < year; i++) daysinyear += udf_isaleapyear(i); t->tv_sec += daysinyear * 3600 * 24; /* Calculate microseconds */ t->tv_nsec = time->centisec * 10000 + time->hund_usec * 100 + time->usec; /* * Calculate the time zone. The timezone is 12 bit signed 2's * complement, so we gotta do some extra magic to handle it right. */ tz.u_tz_offset = le16toh(time->type_tz); tz.u_tz_offset &= 0x0fff; if (tz.u_tz_offset & 0x0800) tz.u_tz_offset |= 0xf000; /* extend the sign to 16 bits */ if ((le16toh(time->type_tz) & 0x1000) && (tz.s_tz_offset != -2047)) t->tv_sec -= tz.s_tz_offset * 60; return; } static int udf_getattr(struct vop_getattr_args *a) { struct vnode *vp; struct udf_node *node; struct vattr *vap; struct file_entry *fentry; struct timespec ts; ts.tv_sec = 0; vp = a->a_vp; vap = a->a_vap; node = VTON(vp); fentry = node->fentry; vap->va_fsid = dev2udev(node->udfmp->im_dev); vap->va_fileid = node->hash_id; vap->va_mode = udf_permtomode(node); vap->va_nlink = le16toh(fentry->link_cnt); /* * XXX The spec says that -1 is valid for uid/gid and indicates an * invalid uid/gid. How should this be represented? */ vap->va_uid = (le32toh(fentry->uid) == -1) ? 0 : le32toh(fentry->uid); vap->va_gid = (le32toh(fentry->gid) == -1) ? 0 : le32toh(fentry->gid); udf_timetotimespec(&fentry->atime, &vap->va_atime); udf_timetotimespec(&fentry->mtime, &vap->va_mtime); vap->va_ctime = vap->va_mtime; /* XXX Stored as an Extended Attribute */ vap->va_rdev = NODEV; if (vp->v_type & VDIR) { /* * Directories that are recorded within their ICB will show * as having 0 blocks recorded. Since tradition dictates * that directories consume at least one logical block, * make it appear so. 
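 * For example (illustrative): on a volume with a 2048-byte
 * logical block size, a directory recorded entirely within its
 * ICB reports va_size = 2048 below rather than 0.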
*/ if (fentry->logblks_rec != 0) { vap->va_size = le64toh(fentry->logblks_rec) * node->udfmp->bsize; } else { vap->va_size = node->udfmp->bsize; } } else { vap->va_size = le64toh(fentry->inf_len); } vap->va_flags = 0; vap->va_gen = 1; vap->va_blocksize = node->udfmp->bsize; vap->va_bytes = le64toh(fentry->inf_len); vap->va_type = vp->v_type; vap->va_filerev = 0; /* XXX */ return (0); } static int udf_setattr(struct vop_setattr_args *a) { struct vnode *vp; struct vattr *vap; vp = a->a_vp; vap = a->a_vap; if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) return (EROFS); if (vap->va_size != (u_quad_t)VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: return (EROFS); case VCHR: case VBLK: case VSOCK: case VFIFO: case VNON: case VBAD: case VMARKER: return (0); } } return (0); } /* * File specific ioctls. */ static int udf_ioctl(struct vop_ioctl_args *a) { printf("%s called\n", __func__); return (ENOTTY); } /* * I'm not sure that this has much value in a read-only filesystem, but * cd9660 has it too. */ static int udf_pathconf(struct vop_pathconf_args *a) { switch (a->a_name) { case _PC_FILESIZEBITS: *a->a_retval = 64; return (0); case _PC_LINK_MAX: *a->a_retval = 65535; return (0); case _PC_NAME_MAX: *a->a_retval = NAME_MAX; return (0); case _PC_SYMLINK_MAX: *a->a_retval = MAXPATHLEN; return (0); case _PC_NO_TRUNC: *a->a_retval = 1; return (0); case _PC_PIPE_BUF: if (a->a_vp->v_type == VDIR || a->a_vp->v_type == VFIFO) { *a->a_retval = PIPE_BUF; return (0); } return (EINVAL); default: return (vop_stdpathconf(a)); } } static int udf_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct udf_node *node = VTON(vp); printf(" ino %lu, on dev %s", (u_long)node->hash_id, devtoname(node->udfmp->im_dev)); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } #define lblkno(udfmp, loc) ((loc) >> (udfmp)->bshift) #define blkoff(udfmp, loc) ((loc) & (udfmp)->bmask) #define lblktosize(udfmp, blk) ((blk) << (udfmp)->bshift) static inline int is_data_in_fentry(const struct udf_node *node) { const struct file_entry *fentry = node->fentry; return ((le16toh(fentry->icbtag.flags) & 0x7) == 3); } static int udf_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct udf_node *node = VTON(vp); struct udf_mnt *udfmp; struct file_entry *fentry; struct buf *bp; uint8_t *data; daddr_t lbn, rablock; off_t diff, fsize; ssize_t n; int error = 0; long size, on; if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); if (is_data_in_fentry(node)) { fentry = node->fentry; data = &fentry->data[le32toh(fentry->l_ea)]; fsize = le32toh(fentry->l_ad); n = uio->uio_resid; diff = fsize - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; error = uiomove(data + uio->uio_offset, (int)n, uio); return (error); } fsize = le64toh(node->fentry->inf_len); udfmp = node->udfmp; do { lbn = lblkno(udfmp, uio->uio_offset); on = blkoff(udfmp, uio->uio_offset); n = min((u_int)(udfmp->bsize - on), uio->uio_resid); diff = fsize - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = udfmp->bsize; rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(udfmp, rablock) < fsize) { error = cluster_read(vp, fsize, lbn, size, NOCRED, uio->uio_resid, (ap->a_ioflag >> 16), 0, &bp); } else { error = bread(vp, 
lbn, size, NOCRED, &bp); } } else { error = bread(vp, lbn, size, NOCRED, &bp); } if (error != 0) { brelse(bp); return (error); } n = min(n, size - bp->b_resid); error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); } /* * Call the OSTA routines to translate the name from a CS0 dstring to a * 16-bit Unicode String. Hooks need to be placed in here to translate from * Unicode to the encoding that the kernel/user expects. Return the length * of the translated string. */ static int udf_transname(char *cs0string, char *destname, int len, struct udf_mnt *udfmp) { unicode_t *transname; char *unibuf, *unip; int i, destlen; ssize_t unilen = 0; size_t destleft = MAXNAMLEN; /* Convert 16-bit Unicode to destname */ if (udfmp->im_flags & UDFMNT_KICONV && udf_iconv) { /* allocate a buffer big enough to hold an 8->16 bit expansion */ unibuf = uma_zalloc(udf_zone_trans, M_WAITOK); unip = unibuf; if ((unilen = (ssize_t)udf_UncompressUnicodeByte(len, cs0string, unibuf)) == -1) { printf("udf: Unicode translation failed\n"); uma_zfree(udf_zone_trans, unibuf); return 0; } while (unilen > 0 && destleft > 0) { udf_iconv->conv(udfmp->im_d2l, __DECONST(const char **, &unibuf), (size_t *)&unilen, (char **)&destname, &destleft); /* Unconverted character found */ if (unilen > 0 && destleft > 0) { *destname++ = '?'; destleft--; unibuf += 2; unilen -= 2; } } uma_zfree(udf_zone_trans, unip); *destname = '\0'; destlen = MAXNAMLEN - (int)destleft; } else { /* allocate a buffer big enough to hold an 8->16 bit expansion */ transname = uma_zalloc(udf_zone_trans, M_WAITOK); if ((unilen = (ssize_t)udf_UncompressUnicode(len, cs0string, transname)) == -1) { printf("udf: Unicode translation failed\n"); uma_zfree(udf_zone_trans, transname); return 0; } for (i = 0; i < unilen ; i++) { if (transname[i] & 0xff00) { destname[i] = '.'; /* Fudge the 16bit chars */ } else { destname[i] = transname[i] & 0xff; } } uma_zfree(udf_zone_trans, transname); destname[unilen] = 0; destlen = (int)unilen; } return (destlen); } /* * Compare a CS0 dstring with a name passed in from the VFS layer. Return * 0 on a successful match, nonzero otherwise. Unicode work may need to be done * here also. */ static int udf_cmpname(char *cs0string, char *cmpname, int cs0len, int cmplen, struct udf_mnt *udfmp) { char *transname; int error = 0; /* This is overkill, but not worth creating a new zone */ transname = uma_zalloc(udf_zone_trans, M_WAITOK); cs0len = udf_transname(cs0string, transname, cs0len, udfmp); /* Easy check. 
If they aren't the same length, they aren't equal */ if ((cs0len == 0) || (cs0len != cmplen)) error = -1; else error = bcmp(transname, cmpname, cmplen); uma_zfree(udf_zone_trans, transname); return (error); } struct udf_uiodir { struct dirent *dirent; u_long *cookies; int ncookies; int acookies; int eofflag; }; static int udf_uiodir(struct udf_uiodir *uiodir, int de_size, struct uio *uio, long cookie) { if (uiodir->cookies != NULL) { if (++uiodir->acookies > uiodir->ncookies) { uiodir->eofflag = 0; return (-1); } *uiodir->cookies++ = cookie; } if (uio->uio_resid < de_size) { uiodir->eofflag = 0; return (-1); } return (uiomove(uiodir->dirent, de_size, uio)); } static struct udf_dirstream * udf_opendir(struct udf_node *node, int offset, int fsize, struct udf_mnt *udfmp) { struct udf_dirstream *ds; ds = uma_zalloc(udf_zone_ds, M_WAITOK | M_ZERO); ds->node = node; ds->offset = offset; ds->udfmp = udfmp; ds->fsize = fsize; return (ds); } static struct fileid_desc * udf_getfid(struct udf_dirstream *ds) { struct fileid_desc *fid; int error, frag_size = 0, total_fid_size; /* End of directory? */ if (ds->offset + ds->off >= ds->fsize) { ds->error = 0; return (NULL); } /* Grab the first extent of the directory */ if (ds->off == 0) { ds->size = 0; error = udf_readatoffset(ds->node, &ds->size, ds->offset, &ds->bp, &ds->data); if (error) { ds->error = error; if (ds->bp != NULL) brelse(ds->bp); return (NULL); } } /* * Clean up from a previous fragmented FID. * XXX Is this the right place for this? */ if (ds->fid_fragment && ds->buf != NULL) { ds->fid_fragment = 0; free(ds->buf, M_UDFFID); } fid = (struct fileid_desc*)&ds->data[ds->off]; /* * Check to see if the fid is fragmented. The first test * ensures that we don't wander off the end of the buffer * looking for the l_iu and l_fi fields. */ if (ds->off + UDF_FID_SIZE > ds->size || ds->off + le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE > ds->size){ /* Copy what we have of the fid into a buffer */ frag_size = ds->size - ds->off; if (frag_size >= ds->udfmp->bsize) { printf("udf: invalid FID fragment\n"); ds->error = EINVAL; return (NULL); } /* * File ID descriptors can only be at most one * logical sector in size. */ ds->buf = malloc(ds->udfmp->bsize, M_UDFFID, M_WAITOK | M_ZERO); bcopy(fid, ds->buf, frag_size); /* Reduce all of the casting magic */ fid = (struct fileid_desc*)ds->buf; if (ds->bp != NULL) brelse(ds->bp); /* Fetch the next allocation */ ds->offset += ds->size; ds->size = 0; error = udf_readatoffset(ds->node, &ds->size, ds->offset, &ds->bp, &ds->data); if (error) { ds->error = error; return (NULL); } /* * If the fragment was so small that we didn't get * the l_iu and l_fi fields, copy those in. */ if (frag_size < UDF_FID_SIZE) bcopy(ds->data, &ds->buf[frag_size], UDF_FID_SIZE - frag_size); /* * Now that we have enough of the fid to work with, * copy in the rest of the fid from the new * allocation. */ total_fid_size = UDF_FID_SIZE + le16toh(fid->l_iu) + fid->l_fi; if (total_fid_size > ds->udfmp->bsize) { printf("udf: invalid FID\n"); ds->error = EIO; return (NULL); } bcopy(ds->data, &ds->buf[frag_size], total_fid_size - frag_size); ds->fid_fragment = 1; } else { total_fid_size = le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE; } /* * Update the offset. Align on a 4 byte boundary because the * UDF spec says so. 
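 * Worked example (illustrative): a FID with total_fid_size = 43
 * advances ds->off by (43 + 3) & ~0x03 = 44 bytes.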
*/ ds->this_off = ds->offset + ds->off; if (!ds->fid_fragment) { ds->off += (total_fid_size + 3) & ~0x03; } else { ds->off = (total_fid_size - frag_size + 3) & ~0x03; } return (fid); } static void udf_closedir(struct udf_dirstream *ds) { if (ds->bp != NULL) brelse(ds->bp); if (ds->fid_fragment && ds->buf != NULL) free(ds->buf, M_UDFFID); uma_zfree(udf_zone_ds, ds); } static int udf_readdir(struct vop_readdir_args *a) { struct vnode *vp; struct uio *uio; struct dirent dir; struct udf_node *node; struct udf_mnt *udfmp; struct fileid_desc *fid; struct udf_uiodir uiodir; struct udf_dirstream *ds; u_long *cookies = NULL; int ncookies; int error = 0; vp = a->a_vp; uio = a->a_uio; node = VTON(vp); udfmp = node->udfmp; uiodir.eofflag = 1; if (a->a_ncookies != NULL) { /* * Guess how many entries are needed. If we run out, this * function will be called again and things will pick up where * it left off. */ ncookies = uio->uio_resid / 8; cookies = malloc(sizeof(u_long) * ncookies, M_TEMP, M_WAITOK); if (cookies == NULL) return (ENOMEM); uiodir.ncookies = ncookies; uiodir.cookies = cookies; uiodir.acookies = 0; } else { uiodir.cookies = NULL; } /* * Iterate through the file id descriptors. Give the parent dir * entry special attention. */ ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len), node->udfmp); while ((fid = udf_getfid(ds)) != NULL) { /* XXX Should we return an error on a bad fid? */ if (udf_checktag(&fid->tag, TAGID_FID)) { printf("Invalid FID tag\n"); hexdump(fid, UDF_FID_SIZE, NULL, 0); error = EIO; break; } /* Is this a deleted file? */ if (fid->file_char & UDF_FILE_CHAR_DEL) continue; if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) { /* Do up the '.' and '..' entries. Dummy values are * used for the cookies since the offset here is * usually zero, and NFS doesn't like that value */ dir.d_fileno = node->hash_id; dir.d_type = DT_DIR; dir.d_name[0] = '.'; dir.d_namlen = 1; dir.d_reclen = GENERIC_DIRSIZ(&dir); dir.d_off = 1; dirent_terminate(&dir); uiodir.dirent = &dir; error = udf_uiodir(&uiodir, dir.d_reclen, uio, 1); if (error) break; dir.d_fileno = udf_getid(&fid->icb); dir.d_type = DT_DIR; dir.d_name[0] = '.'; dir.d_name[1] = '.'; dir.d_namlen = 2; dir.d_reclen = GENERIC_DIRSIZ(&dir); dir.d_off = 2; dirent_terminate(&dir); uiodir.dirent = &dir; error = udf_uiodir(&uiodir, dir.d_reclen, uio, 2); } else { dir.d_namlen = udf_transname(&fid->data[fid->l_iu], &dir.d_name[0], fid->l_fi, udfmp); dir.d_fileno = udf_getid(&fid->icb); dir.d_type = (fid->file_char & UDF_FILE_CHAR_DIR) ? DT_DIR : DT_UNKNOWN; dir.d_reclen = GENERIC_DIRSIZ(&dir); dir.d_off = ds->this_off; dirent_terminate(&dir); uiodir.dirent = &dir; error = udf_uiodir(&uiodir, dir.d_reclen, uio, ds->this_off); } if (error) break; uio->uio_offset = ds->offset + ds->off; } /* tell the calling layer whether we need to be called again */ *a->a_eofflag = uiodir.eofflag; if (error < 0) error = 0; if (!error) error = ds->error; udf_closedir(ds); if (a->a_ncookies != NULL) { if (error) free(cookies, M_TEMP); else { *a->a_ncookies = uiodir.acookies; *a->a_cookies = cookies; } } return (error); } static int udf_readlink(struct vop_readlink_args *ap) { struct path_component *pc, *end; struct vnode *vp; struct uio uio; struct iovec iov[1]; struct udf_node *node; void *buf; char *cp; int error, len, root; /* * A symbolic link in UDF is a list of variable-length path * component structures. We build a pathname in the caller's * uio by traversing this list.
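 * For example (illustrative): the component sequence
 * [ROOT][PATH "etc"] is emitted as "/etc", while
 * [DOTDOT][PATH "data"] is emitted as "../data".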
*/ vp = ap->a_vp; node = VTON(vp); len = le64toh(node->fentry->inf_len); buf = malloc(len, M_DEVBUF, M_WAITOK); iov[0].iov_len = len; iov[0].iov_base = buf; uio.uio_iov = iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = iov[0].iov_len; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; error = VOP_READ(vp, &uio, 0, ap->a_cred); if (error) goto error; pc = buf; end = (void *)((char *)buf + len); root = 0; while (pc < end) { switch (pc->type) { case UDF_PATH_ROOT: /* Only allow this at the beginning of a path. */ if ((void *)pc != buf) { error = EINVAL; goto error; } cp = "/"; len = 1; root = 1; break; case UDF_PATH_DOT: cp = "."; len = 1; break; case UDF_PATH_DOTDOT: cp = ".."; len = 2; break; case UDF_PATH_PATH: if (pc->length == 0) { error = EINVAL; goto error; } /* * XXX: We only support CS8 which appears to map * to ASCII directly. */ switch (pc->identifier[0]) { case 8: cp = pc->identifier + 1; len = pc->length - 1; break; default: error = EOPNOTSUPP; goto error; } break; default: error = EINVAL; goto error; } /* * If this is not the first component, insert a path * separator. */ if (pc != buf) { /* If we started with root we already have a "/". */ if (root) goto skipslash; root = 0; if (ap->a_uio->uio_resid < 1) { error = ENAMETOOLONG; goto error; } error = uiomove("/", 1, ap->a_uio); if (error) break; } skipslash: /* Append string at 'cp' of length 'len' to our path. */ if (len > ap->a_uio->uio_resid) { error = ENAMETOOLONG; goto error; } error = uiomove(cp, len, ap->a_uio); if (error) break; /* Advance to next component. */ pc = (void *)((char *)pc + 4 + pc->length); } error: free(buf, M_DEVBUF); return (error); } static int udf_strategy(struct vop_strategy_args *a) { struct buf *bp; struct vnode *vp; struct udf_node *node; struct bufobj *bo; off_t offset; uint32_t maxsize; daddr_t sector; int error; bp = a->a_bp; vp = a->a_vp; node = VTON(vp); if (bp->b_blkno == bp->b_lblkno) { offset = lblktosize(node->udfmp, bp->b_lblkno); error = udf_bmap_internal(node, offset, &sector, &maxsize); if (error) { clrbuf(bp); bp->b_blkno = -1; bufdone(bp); return (0); } /* bmap gives sector numbers, bio works with device blocks */ bp->b_blkno = sector << (node->udfmp->bshift - DEV_BSHIFT); } bo = node->udfmp->im_bo; bp->b_iooffset = dbtob(bp->b_blkno); BO_STRATEGY(bo, bp); return (0); } static int udf_bmap(struct vop_bmap_args *a) { struct udf_node *node; uint32_t max_size; daddr_t lsector; int nblk; int error; node = VTON(a->a_vp); if (a->a_bop != NULL) *a->a_bop = &node->udfmp->im_devvp->v_bufobj; if (a->a_bnp == NULL) return (0); if (a->a_runb) *a->a_runb = 0; /* * UDF_INVALID_BMAP means the file data is embedded in the fentry; this * is an internal error that should not be propagated to calling code. * The most obvious mapping for this error is EOPNOTSUPP, as we cannot * truly translate block numbers in this case. * Incidentally, this return code makes the vnode pager use VOP_READ * to get data for mmap-ed pages, and udf_read knows how to do the right * thing for this kind of file. */ error = udf_bmap_internal(node, a->a_bn << node->udfmp->bshift, &lsector, &max_size); if (error == UDF_INVALID_BMAP) return (EOPNOTSUPP); if (error) return (error); /* Translate logical to physical sector number */ *a->a_bnp = lsector << (node->udfmp->bshift - DEV_BSHIFT); /* * Determine maximum number of readahead blocks following the * requested block.
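* * For example (editor's numbers): with a 2048-byte logical block size and max_size == 16384, the extent holds 8 blocks, so up to 7 blocks beyond the requested one may be read ahead; the run is further capped at (MAXBSIZE >> bshift) - 1 so a cluster never exceeds MAXBSIZE.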
*/ if (a->a_runp) { nblk = (max_size >> node->udfmp->bshift) - 1; if (nblk <= 0) *a->a_runp = 0; else if (nblk >= (MAXBSIZE >> node->udfmp->bshift)) *a->a_runp = (MAXBSIZE >> node->udfmp->bshift) - 1; else *a->a_runp = nblk; } if (a->a_runb) { *a->a_runb = 0; } return (0); } /* * The all powerful VOP_LOOKUP(). */ static int udf_lookup(struct vop_cachedlookup_args *a) { struct vnode *dvp; struct vnode *tdp = NULL; struct vnode **vpp = a->a_vpp; struct udf_node *node; struct udf_mnt *udfmp; struct fileid_desc *fid = NULL; struct udf_dirstream *ds; u_long nameiop; u_long flags; char *nameptr; long namelen; ino_t id = 0; int offset, error = 0; int fsize, lkflags, ltype, numdirpasses; dvp = a->a_dvp; node = VTON(dvp); udfmp = node->udfmp; nameiop = a->a_cnp->cn_nameiop; flags = a->a_cnp->cn_flags; lkflags = a->a_cnp->cn_lkflags; nameptr = a->a_cnp->cn_nameptr; namelen = a->a_cnp->cn_namelen; fsize = le64toh(node->fentry->inf_len); /* * If this is a LOOKUP and we've already partially searched through * the directory, pick up where we left off and flag that the * directory may need to be searched twice. For a full description, * see /sys/fs/cd9660/cd9660_lookup.c:cd9660_lookup() */ if (nameiop != LOOKUP || node->diroff == 0 || node->diroff > fsize) { offset = 0; numdirpasses = 1; } else { offset = node->diroff; numdirpasses = 2; nchstats.ncs_2passes++; } lookloop: ds = udf_opendir(node, offset, fsize, udfmp); while ((fid = udf_getfid(ds)) != NULL) { /* XXX Should we return an error on a bad fid? */ if (udf_checktag(&fid->tag, TAGID_FID)) { printf("udf_lookup: Invalid tag\n"); error = EIO; break; } /* Is this a deleted file? */ if (fid->file_char & UDF_FILE_CHAR_DEL) continue; if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) { if (flags & ISDOTDOT) { id = udf_getid(&fid->icb); break; } } else { if (!(udf_cmpname(&fid->data[fid->l_iu], nameptr, fid->l_fi, namelen, udfmp))) { id = udf_getid(&fid->icb); break; } } } if (!error) error = ds->error; /* XXX Bail out here? */ if (error) { udf_closedir(ds); return (error); } /* Did we have a match? */ if (id) { /* * Remember where this entry was if it's the final * component. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) node->diroff = ds->offset + ds->off; if (numdirpasses == 2) nchstats.ncs_pass2++; udf_closedir(ds); if (flags & ISDOTDOT) { error = vn_vget_ino(dvp, id, lkflags, &tdp); } else if (node->hash_id == id) { VREF(dvp); /* we want ourself, ie "." */ /* * When we lookup "." we still can be asked to lock it * differently. */ ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); } tdp = dvp; } else error = udf_vget(udfmp->im_mountp, id, lkflags, &tdp); if (!error) { *vpp = tdp; /* Put this entry in the cache */ if (flags & MAKEENTRY) cache_enter(dvp, *vpp, a->a_cnp); } } else { /* Name wasn't found on this pass. Do another pass? */ if (numdirpasses == 2) { numdirpasses--; offset = 0; udf_closedir(ds); goto lookloop; } udf_closedir(ds); /* Enter name into cache as non-existent */ if (flags & MAKEENTRY) cache_enter(dvp, *vpp, a->a_cnp); if ((flags & ISLASTCN) && (nameiop == CREATE || nameiop == RENAME)) { error = EROFS; } else { error = ENOENT; } } return (error); } static int udf_reclaim(struct vop_reclaim_args *a) { struct vnode *vp; struct udf_node *unode; vp = a->a_vp; unode = VTON(vp); - /* - * Destroy the vm object and flush associated pages.
- */ - vnode_destroy_vobject(vp); - if (unode != NULL) { vfs_hash_remove(vp); if (unode->fentry != NULL) free(unode->fentry, M_UDFFENTRY); uma_zfree(udf_zone_node, unode); vp->v_data = NULL; } return (0); } static int udf_vptofh(struct vop_vptofh_args *a) { struct udf_node *node; struct ifid *ifhp; node = VTON(a->a_vp); ifhp = (struct ifid *)a->a_fhp; ifhp->ifid_len = sizeof(struct ifid); ifhp->ifid_ino = node->hash_id; return (0); } /* * Read the block and then set the data pointer to correspond with the * offset passed in. Only read in at most 'size' bytes, and then set 'size' * to the number of bytes pointed to. If 'size' is zero, try to read in a * whole extent. * * Note that *bp may or may not be assigned on error. * */ static int udf_readatoffset(struct udf_node *node, int *size, off_t offset, struct buf **bp, uint8_t **data) { struct udf_mnt *udfmp = node->udfmp; struct vnode *vp = node->i_vnode; struct file_entry *fentry; struct buf *bp1; uint32_t max_size; daddr_t sector; off_t off; int adj_size; int error; /* * This call is made *not* only to detect the UDF_INVALID_BMAP case; * max_size is also used as an ad-hoc read-ahead hint in the "normal" case. */ error = udf_bmap_internal(node, offset, &sector, &max_size); if (error == UDF_INVALID_BMAP) { /* * This error means that the file *data* is stored in the * allocation descriptor field of the file entry. */ fentry = node->fentry; *data = &fentry->data[le32toh(fentry->l_ea)]; *size = le32toh(fentry->l_ad); if (offset >= *size) *size = 0; else { *data += offset; *size -= offset; } return (0); } else if (error != 0) { return (error); } /* Adjust the size so that it is within range */ if (*size == 0 || *size > max_size) *size = max_size; /* * Because we will read starting at block boundary, we need to adjust * how much we need to read so that all promised data is in. * Also, we can't promise to read more than MAXBSIZE bytes starting * from block boundary, so adjust what we promise too. */ off = blkoff(udfmp, offset); *size = min(*size, MAXBSIZE - off); adj_size = (*size + off + udfmp->bmask) & ~udfmp->bmask; *bp = NULL; if ((error = bread(vp, lblkno(udfmp, offset), adj_size, NOCRED, bp))) { printf("warning: udf_readlblks returned error %d\n", error); /* note: *bp may be non-NULL */ return (error); } bp1 = *bp; *data = (uint8_t *)&bp1->b_data[offset & udfmp->bmask]; return (0); } /* * Translate a file offset into a logical block and then into a physical * block. * max_size - maximum number of bytes that can be read starting from given * offset, rather than beginning of calculated sector number */ static int udf_bmap_internal(struct udf_node *node, off_t offset, daddr_t *sector, uint32_t *max_size) { struct udf_mnt *udfmp; struct file_entry *fentry; void *icb; struct icb_tag *tag; uint32_t icblen = 0; daddr_t lsector; int ad_offset, ad_num = 0; int i, p_offset; udfmp = node->udfmp; fentry = node->fentry; tag = &fentry->icbtag; switch (le16toh(tag->strat_type)) { case 4: break; case 4096: printf("Cannot deal with strategy4096 yet!\n"); return (ENODEV); default: printf("Unknown strategy type %d\n", tag->strat_type); return (ENODEV); } switch (le16toh(tag->flags) & 0x7) { case 0: /* * The allocation descriptor field is filled with short_ad's. * If the offset is beyond the current extent, look for the * next extent.
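* * Sketch of the walk (editor's illustration): given extents of 8192 and 4096 bytes, an offset of 10000 first consumes the 8192-byte extent (leaving 1808), then resolves inside the second extent at (1808 >> bshift) sectors past that extent's start, with *max_size = 4096 - 1808 bytes readable from there.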
*/ do { offset -= icblen; ad_offset = sizeof(struct short_ad) * ad_num; if (ad_offset > le32toh(fentry->l_ad)) { printf("File offset out of bounds\n"); return (EINVAL); } icb = GETICB(short_ad, fentry, le32toh(fentry->l_ea) + ad_offset); icblen = GETICBLEN(short_ad, icb); ad_num++; } while(offset >= icblen); lsector = (offset >> udfmp->bshift) + le32toh(((struct short_ad *)(icb))->pos); *max_size = icblen - offset; break; case 1: /* * The allocation descriptor field is filled with long_ad's * If the offset is beyond the current extent, look for the * next extent. */ do { offset -= icblen; ad_offset = sizeof(struct long_ad) * ad_num; if (ad_offset > le32toh(fentry->l_ad)) { printf("File offset out of bounds\n"); return (EINVAL); } icb = GETICB(long_ad, fentry, le32toh(fentry->l_ea) + ad_offset); icblen = GETICBLEN(long_ad, icb); ad_num++; } while(offset >= icblen); lsector = (offset >> udfmp->bshift) + le32toh(((struct long_ad *)(icb))->loc.lb_num); *max_size = icblen - offset; break; case 3: /* * This type means that the file *data* is stored in the * allocation descriptor field of the file entry. */ *max_size = 0; *sector = node->hash_id + udfmp->part_start; return (UDF_INVALID_BMAP); case 2: /* DirectCD does not use extended_ad's */ default: printf("Unsupported allocation descriptor %d\n", tag->flags & 0x7); return (ENODEV); } *sector = lsector + udfmp->part_start; /* * Check the sparing table. Each entry represents the beginning of * a packet. */ if (udfmp->s_table != NULL) { for (i = 0; i< udfmp->s_table_entries; i++) { p_offset = lsector - le32toh(udfmp->s_table->entries[i].org); if ((p_offset < udfmp->p_sectors) && (p_offset >= 0)) { *sector = le32toh(udfmp->s_table->entries[i].map) + p_offset; break; } } } return (0); } diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 1af4aa028833..b6ec815fca07 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,5767 +1,5779 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_init_counters(struct vnode *); static void v_incr_usecount(struct vnode *); static void v_incr_usecount_locked(struct vnode *); static void v_incr_devcount(struct vnode *); static void v_decr_devcount(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_locked(void *arg); static void vfs_knl_assert_unlocked(void *arg); static void vnlru_return_batches(struct vfsops *mnt_op); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); /* * These fences are intended for cases where some synchronization is * needed between access of v_iflags and lockless vnode refcount (v_holdcnt * and v_usecount) updates. Access to v_iflags is generally synchronized * by the interlock, but we have some internal assertions that check vnode * flags without acquiring the lock. Thus, these fences are INVARIANTS-only * for now. */ #ifdef INVARIANTS #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() #else #define VNODE_REFCOUNT_FENCE_ACQ() #define VNODE_REFCOUNT_FENCE_REL() #endif /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. */ static unsigned long numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static counter_u64_t vnodes_created; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, "Number of vnodes created by getnewvnode"); static u_long mnt_free_list_batch = 128; SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW, &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list"); /* * Conversion tables for conversion from vnode types to inode formats * and back. 
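* * Usage sketch (editor's note; the IFTOVT()/VTTOIF() wrappers live in sys/vnode.h): IFTOVT(mode) indexes iftovt_tab with (mode & S_IFMT) >> 12, so S_IFDIR (0040000) yields VDIR, and VTTOIF(VREG) yields S_IFREG.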
*/ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * "Free" vnode target. Free vnodes are rarely completely free, but are * just ones that are cheap to recycle. Usually they are for files which * have been stat'd but not read; these usually have inode and namecache * data attached to them. This target is the preferred minimum size of a * sub-cache consisting mostly of such files. The system balances the size * of this sub-cache with its complement to try to prevent either from * thrashing while the other is relatively inactive. The targets express * a preference for the best balance. * * "Above" this target there are 2 further targets (watermarks) related * to recycling of free vnodes. In the best-operating case, the cache is * exactly full, the free list has size between vlowat and vhiwat above the * free target, and recycling from it and normal use maintains this state. * Sometimes the free list is below vlowat or even empty, but this state * is even better for immediate use provided the cache is not full. * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free * ones) to reach one of these states. The watermarks are currently hard- * coded as 4% and 9% of the available space higher. These and the default * of 25% for wantfreevnodes are too large if the memory size is large. * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim * whenever vnlru_proc() becomes active. */ static u_long wantfreevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes"); static u_long freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of \"free\" vnodes"); static counter_u64_t recycles_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, "Number of vnodes recycled to meet vnode cache targets"); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "Number of calls to reassignbuf"); static counter_u64_t free_owe_inact; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, "Number of times free vnodes kept on active list due to VFS " "owing inactivation"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds.
Thus, mounted block devices * are delayed only about half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third of the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ int desiredvnodes; static int gapvnodes; /* gap between wanted and desired */ static int vhiwat; /* enough extras after expansion */ static int vlowat; /* minimal extras before expansion */ static int vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static int sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) { int error, old_desiredvnodes; old_desiredvnodes = desiredvnodes; if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) return (error); if (old_desiredvnodes != desiredvnodes) { wantfreevnodes = desiredvnodes / 4; /* XXX locking seems to be incomplete.
*/ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); } return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen > PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if ((vp->v_iflag & VI_DOOMED) != 0) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } counter_u64_add(recycles_count, 1); vgone(vp); putvnode: NDFREE(&nd, 0); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; counter_u64_add(recycles_count, 1); vgone(vp); VOP_UNLOCK(vp, 0); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ static int vnsz2log; /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return uma_zalloc(buf_trie_zone, M_NOWAIT); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree(buf_trie_zone, node); } PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ #endif /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. 
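* A filesystem that does support shared vnode locking opts in by clearing the flag on its vnodes, e.g. via VN_LOCK_ASHARE(vp) (editor's note; see sys/vnode.h).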
*/ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { u_int i; int physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. */ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); recycles_count = counter_u64_alloc(M_WAITOK); free_owe_inact = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. 
Used to synchronize access and to delay * unmounting. Note that mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock; it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, the vfs_busy-enforced lock is before the lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); MNT_ILOCK(mp); MNT_REF(mp); /* * If the mount point is currently being unmounted, sleep until the * mount point fate is decided. If the thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that the mount point it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REL(mp); KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); mp->mnt_lockref--; if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier.
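* The returned mount point, if found, has a reference taken on it via vfs_ref(); the caller is expected to drop it when done, e.g. (editor's sketch): if ((mp = vfs_getvfs(&fsid)) != NULL) { ...use mp...; vfs_rel(mp); }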
*/ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement a simple direct-mapped * cache for popular filesystem identifiers. The cache is lockless, using * the fact that struct mount's are never freed. In the worst case we may * get a pointer to an unmounted or even a different filesystem, so we have * to check what we got and go the slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) to make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts.
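* * Layout sketch (editor's illustration of the code below): val[0] is makedev(255, type | hi | lo), where type is (vfc_typenum & 0xFF) << 24, hi is (mntid_base & 0xFF00) << 8 and lo is (mntid_base & 0xFF), so the 16-bit mntid_base counter is spread across the minor number while the filesystem type keeps different filesystems apart.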
*/ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, &timestamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed: * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. * * @param mp Try to reclaim vnodes from this mountpoint * @param reclaim_nc_src Only reclaim directories with outgoing namecache * entries if this argument is true * @param trigger Only reclaim vnodes with fewer than this many resident * pages. * @return The number of vnodes that were reclaimed.
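* * Rough worked example (editor's numbers): with desiredvnodes == 100000, gapvnodes == 75000 and 40000 vnodes on the mount's list, the pass below targets 40000 * 75000 / 100000 / 10 + 1 == 3001 reclaims.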
*/ static int vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger) { struct vnode *vp; int count, done, target; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize; target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; while (count != 0 && done < target) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; /* * XXX LRU is completely broken for non-free vnodes. First * by calling here in mountpoint order, then by moving * unselected vnodes to the end here, and most grossly by * removing the vlruvp() function that was supposed to * maintain the order. (This function was born broken * since syncer problems prevented it doing anything.) The * order is closer to LRC (C = Created). * * LRU reclaiming of vnodes seems to have last worked in * FreeBSD-3 where LRU wasn't mentioned under any spelling. * Then there was no hold count, and inactive vnodes were * simply put on the free list in LRU order. The separate * lists also break LRU. We prefer to reclaim from the * free list for technical reasons. This tends to thrash * the free list to keep very unrecently used held vnodes. * The problem is mitigated by keeping the free list large. */ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. * Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. */ if (vp->v_usecount || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || ((vp->v_iflag & VI_FREE) != 0) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_iflag & VI_FREE) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, 0); vdropl(vp); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); counter_u64_add(recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp, 0); vdropl(vp); done++; next_iter_mntunlocked: if (!should_yield()) goto relock_mnt; goto yield; next_iter: if (!should_yield()) continue; MNT_IUNLOCK(mp); yield: kern_yield(PRI_USER); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. 
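* vnode_free_list_mtx must be held by the caller; the vnlru_free() wrapper below takes and drops it. Passing a non-NULL mnt_op restricts recycling to vnodes from mounts of that filesystem type, e.g. (editor's sketch, assuming the filesystem's vfsops symbol) vnlru_free(32, &zfs_vfsops) would recycle only ZFS vnodes.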
*/ static void vnlru_free_locked(int count, struct vfsops *mnt_op) { struct vnode *vp; struct mount *mp; bool tried_batches; tried_batches = false; mtx_assert(&vnode_free_list_mtx, MA_OWNED); if (count > max_vnlru_free) count = max_vnlru_free; for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (vp == NULL) { if (tried_batches) break; mtx_unlock(&vnode_free_list_mtx); vnlru_return_batches(mnt_op); tried_batches = true; mtx_lock(&vnode_free_list_mtx); continue; } VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); KASSERT((vp->v_iflag & VI_FREE) != 0, ("Removing vnode not on freelist")); KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Mangling active vnode")); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); /* * Don't recycle if our vnode is from a different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. * Don't recycle if we can't get the interlock without * blocking. */ if ((mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); continue; } VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, vp, ("vp inconsistent on freelist")); /* * The clear of VI_FREE prevents activation of the * vnode. There is no sense in putting the vnode on * the mount point active list, only to remove it * later during recycling. Inline the relevant part * of vholdl(), to avoid triggering assertions or * activating. */ freevnodes--; vp->v_iflag &= ~VI_FREE; VNODE_REFCOUNT_FENCE_REL(); refcount_acquire(&vp->v_holdcnt); mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); /* * If the recycle succeeded, this vdrop will actually free * the vnode. If not, it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } void vnlru_free(int count, struct vfsops *mnt_op) { mtx_lock(&vnode_free_list_mtx); vnlru_free_locked(count, mnt_op); mtx_unlock(&vnode_free_list_mtx); } /* XXX some names and initialization are bad for limits and watermarks.
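* As a worked example (editor's numbers): with desiredvnodes == 100000 and the default wantfreevnodes == desiredvnodes / 4, gapvnodes is 75000, vhiwat is 6818 (just under 10% of the gap) and vlowat is 3409.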
*/ static int vspace(void) { int space; gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; if (numvnodes > desiredvnodes) return (0); space = desiredvnodes - numvnodes; if (freevnodes > wantfreevnodes) space += freevnodes - wantfreevnodes; return (space); } static void vnlru_return_batch_locked(struct mount *mp) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); if (mp->mnt_tmpfreevnodelistsize == 0) return; TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) { VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp, ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist")); vp->v_mflag &= ~VMP_TMPMNTFREELIST; } mtx_lock(&vnode_free_list_mtx); TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist); freevnodes += mp->mnt_tmpfreevnodelistsize; mtx_unlock(&vnode_free_list_mtx); mp->mnt_tmpfreevnodelistsize = 0; } static void vnlru_return_batch(struct mount *mp) { mtx_lock(&mp->mnt_listmtx); vnlru_return_batch_locked(mp); mtx_unlock(&mp->mnt_listmtx); } static void vnlru_return_batches(struct vfsops *mnt_op) { struct mount *mp, *nmp; bool need_unbusy; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { need_unbusy = false; if (mnt_op != NULL && mp->mnt_op != mnt_op) goto next; if (mp->mnt_tmpfreevnodelistsize == 0) goto next; if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) { vnlru_return_batch(mp); need_unbusy = true; mtx_lock(&mountlist_mtx); } next: nmp = TAILQ_NEXT(mp, mnt_list); if (need_unbusy) vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrureclaim() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; for (;;) { kproc_suspend_check(vnlruproc); mtx_lock(&vnode_free_list_mtx); /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (numvnodes > desiredvnodes) vnlru_free_locked(numvnodes - desiredvnodes, NULL); /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or not excessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = 0; } if (vspace() >= vlowat && force == 0) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; onumvnodes = numvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. * The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (numvnodes <= desiredvnodes) usevnodes = numvnodes - freevnodes; else usevnodes = numvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress.
The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } done += vlrureclaim(mp, reclaim_nc_src, trigger); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) uma_reclaim(); if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else kern_yield(PRI_USER); /* * After becoming active to expand above low water, keep * active until above high water. */ force = vspace() < vhiwat; } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may be found and locked via some other list; if so, we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, 0); VI_UNLOCK(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) { counter_u64_add(recycles_count, 1); vgonel(vp); } VOP_UNLOCK(vp, 0); VI_UNLOCK(vp); vn_finished_write(vnmp); return (0); } static void vcheckspace(void) { if (vspace() < vlowat && vnlruproc_sig == 0) { vnlruproc_sig = 1; wakeup(vnlruproc); } } /* * Wait if necessary for space for a new vnode. */ static int getnewvnode_wait(int suspended) { mtx_assert(&vnode_free_list_mtx, MA_OWNED); if (numvnodes >= desiredvnodes) { if (suspended) { /* * The file system is being suspended. We cannot * risk a deadlock here, so allow allocation of * another vnode even if this would give too many. */ return (0); } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); } /* Post-adjust like the pre-adjust in getnewvnode().
*/ if (numvnodes + 1 > desiredvnodes && freevnodes > 1) vnlru_free_locked(1, NULL); return (numvnodes >= desiredvnodes ? ENFILE : 0); } /* * This hack is fragile, and probably not needed any more now that the * watermark handling works. */ void getnewvnode_reserve(u_int count) { struct thread *td; /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ /* XXX no longer so quick, but this part is not racy. */ mtx_lock(&vnode_free_list_mtx); if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes, freevnodes - wantfreevnodes), NULL); mtx_unlock(&vnode_free_list_mtx); td = curthread; /* First try to be quick and racy. */ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { td->td_vp_reserv += count; vcheckspace(); /* XXX no longer so quick, but more racy */ return; } else atomic_subtract_long(&numvnodes, count); mtx_lock(&vnode_free_list_mtx); while (count > 0) { if (getnewvnode_wait(0) == 0) { count--; td->td_vp_reserv++; atomic_add_long(&numvnodes, 1); } } vcheckspace(); mtx_unlock(&vnode_free_list_mtx); } /* * This hack is fragile, especially if desiredvnodes or wantfreevnodes are * misconfigured or changed significantly. Reducing desiredvnodes below * the reserved amount should cause bizarre behaviour like reducing it * below the number of active vnodes -- the system will try to reduce * numvnodes to match, but should fail, so the subtraction below should * not overflow. */ void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; atomic_subtract_long(&numvnodes, td->td_vp_reserv); td->td_vp_reserv = 0; } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; static int cyclecount; int error __unused; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); vp = NULL; td = curthread; if (td->td_vp_reserv > 0) { td->td_vp_reserv -= 1; goto alloc; } mtx_lock(&vnode_free_list_mtx); if (numvnodes < desiredvnodes) cyclecount = 0; else if (cyclecount++ >= freevnodes) { cyclecount = 0; vstir = 1; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (numvnodes + 1 <= desiredvnodes) ; else if (freevnodes > 0) vnlru_free_locked(1, NULL); else { error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (error != 0) { mtx_unlock(&vnode_free_list_mtx); return (error); } #endif } vcheckspace(); atomic_add_long(&numvnodes, 1); mtx_unlock(&vnode_free_list_mtx); alloc: counter_u64_add(vnodes_created, 1); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name.
* * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; if (lo->lo_name != tag) { lo->lo_name = tag; WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_init_counters(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; int active; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, ("Active vnode list size %d > Vnode list size %d", mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); active = vp->v_iflag & VI_ACTIVE; vp->v_iflag &= ~VI_ACTIVE; if (active) { mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. 
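* * The resulting acquisition order, as a sketch: MNT_ILOCK(mp), then VI_LOCK(vp), then mtx_lock(&mp->mnt_listmtx) for the active-list insertion; the interlock is dropped only once the vnode is on both lists.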
*/ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag |= VI_ACTIVE; mtx_lock(&mp->mnt_listmtx); TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&mp->mnt_listmtx); VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. 
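 *
 * Illustrative sketch of the public entry point above: with the vnode
 * lock held, V_SAVE writes dirty buffers back before invalidating,
 * while a zero flags argument simply discards everything.
 */
#if 0	/* Example sketch; never compiled. */
	/* Flush dirty data to disk, then invalidate all buffers. */
	error = vinvalbuf(vp, V_SAVE, 0, 0);
	/* Throw everything away, e.g. after the flush above failed. */
	error = vinvalbuf(vp, 0, 0, 0);
#endif
/*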
* */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. 
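 * For example, with a 512 byte block size, truncating to length 100
 * gives startlbn = howmany(100, 512) = 1: lbn 0 still holds the new
 * end of file and is kept, while invalidation starts at lbn 1.
 * Truncating to length 0 gives startlbn 0 and discards everything.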
*/ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). */ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! 
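 *
 * For example, inserting lblkno 7 into a clean list holding 2, 5 and 9:
 * the tail (9) is not smaller than 7, so BUF_PCTRIE_LOOKUP_LE(7)
 * returns the buffer at 5 and the new buffer is linked immediately
 * after it, keeping the list sorted.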
*/ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. */ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. 
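 *
 * For example, with syncer_delayno at 10, syncer_mask 127 and a
 * requested delay of 30 seconds, the bufobj is hung off slot
 * (10 + 30) & 127 = 40 and is visited when the syncer wheel reaches
 * that slot (delays are first clamped to syncer_maxdelay - 2).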
*/ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second.
*/ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting the watchdog */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run through its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong.
*/ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } static void v_incr_usecount_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); if ((vp->v_iflag & VI_OWEINACT) != 0) { VNASSERT(vp->v_usecount == 0, vp, ("vnode with usecount and VI_OWEINACT set")); vp->v_iflag &= ~VI_OWEINACT; } refcount_acquire(&vp->v_usecount); v_incr_devcount(vp); } /* * Increment the use count on the vnode, taking care to reference * the driver's usecount if this is a chardev. */ static void v_incr_usecount(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (vp->v_type != VCHR && refcount_acquire_if_not_zero(&vp->v_usecount)) { VNODE_REFCOUNT_FENCE_ACQ(); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("vnode with usecount and VI_OWEINACT set")); } else { VI_LOCK(vp); v_incr_usecount_locked(vp); VI_UNLOCK(vp); } } /* * Increment si_usecount of the associated device, if any. */ static void v_incr_devcount(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement si_usecount of the associated device, if any. */ static void v_decr_devcount(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. 
VI_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * Notes on lockless counter manipulation: * _vhold, vputx and other routines make various decisions based * on either holdcnt or usecount being 0. As long as either counter * is not transitioning 0->1 nor 1->0, the manipulation can be done * with atomic operations. Otherwise the interlock is taken covering * both the atomic and additional actions. */ int vget(struct vnode *vp, int flags, struct thread *td) { int error, oweinact; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vget: invalid lock operation")); if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); if ((flags & LK_VNHELD) != 0) VNASSERT((vp->v_holdcnt > 0), vp, ("vget: LK_VNHELD passed but vnode not held")); CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); if ((flags & LK_VNHELD) == 0) _vhold(vp, (flags & LK_INTERLOCK) != 0); if ((error = vn_lock(vp, flags)) != 0) { vdrop(vp); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); /* * We don't guarantee that any particular close will * trigger inactive processing so just make a best effort * here at preventing a reference to a removed file. If * we don't succeed no harm is done. * * Upgrade our holdcnt to a usecount. */ if (vp->v_type == VCHR || !refcount_acquire_if_not_zero(&vp->v_usecount)) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { oweinact = 0; } else { oweinact = 1; vp->v_iflag &= ~VI_OWEINACT; VNODE_REFCOUNT_FENCE_REL(); } refcount_acquire(&vp->v_usecount); v_incr_devcount(vp); if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && (flags & LK_NOWAIT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } return (0); } /* * Increase the reference (use) and hold count of a vnode. * This will also remove the vnode from the free list if it is presently free. */ void vref(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); _vhold(vp, false); v_incr_usecount(vp); } void vrefl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); _vhold(vp, true); v_incr_usecount_locked(vp); } void vrefact(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (__predict_false(vp->v_type == VCHR)) { VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp, ("%s: wrong ref counts", __func__)); vref(vp); return; } #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__)); old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old > 0, vp, ("%s: wrong use count", __func__)); #else refcount_acquire(&vp->v_holdcnt); refcount_acquire(&vp->v_usecount); #endif } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism is used to * stop other processes from gaining references to the vnode. This may be the * case if the caller holds the only reference. This is also useful when stale * data is acceptable as race conditions may be accounted for by some other * means. */ int vrefcnt(struct vnode *vp) { return (vp->v_usecount); } #define VPUTX_VRELE 1 #define VPUTX_VPUT 2 #define VPUTX_VUNREF 3 /* * Decrement the use and hold counts for a vnode. * * See an explanation near vget() as to why atomic operation is safe. 
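 *
 * Illustrative sketch only: a typical consumer pairs vget() with one
 * of the release primitives defined below.
 */
#if 0	/* Example sketch; never compiled. */
	error = vget(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
	if (error == 0) {
		/* ... operate on the locked, referenced vnode ... */
		vput(vp);	/* unlock and drop the use count in one call */
	}
#endif
/*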
*/ static void vputx(struct vnode *vp, int func) { int error; KASSERT(vp != NULL, ("vputx: null vp")); if (func == VPUTX_VUNREF) ASSERT_VOP_LOCKED(vp, "vunref"); else if (func == VPUTX_VPUT) ASSERT_VOP_LOCKED(vp, "vput"); else KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (vp->v_type != VCHR && refcount_release_if_not_last(&vp->v_usecount)) { if (func == VPUTX_VPUT) VOP_UNLOCK(vp, 0); vdrop(vp); return; } VI_LOCK(vp); /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ if (!refcount_release(&vp->v_usecount) || (vp->v_iflag & VI_DOINGINACT)) { if (func == VPUTX_VPUT) VOP_UNLOCK(vp, 0); v_decr_devcount(vp); vdropl(vp); return; } v_decr_devcount(vp); error = 0; if (vp->v_usecount != 0) { vn_printf(vp, "vputx: usecount not zero for vnode "); panic("vputx: usecount not zero"); } CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); /* * Check if the fs wants to perform inactive processing. Note we * may be only holding the interlock, in which case it is possible * someone else called vgone on the vnode and ->v_data is now NULL. * Since vgone performs inactive on its own there is nothing to do * here but to drop our hold count. */ if (__predict_false(vp->v_iflag & VI_DOOMED) || VOP_NEED_INACTIVE(vp) == 0) { if (func == VPUTX_VPUT) VOP_UNLOCK(vp, 0); vdropl(vp); return; } /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ vp->v_iflag |= VI_OWEINACT; switch (func) { case VPUTX_VRELE: error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; case VPUTX_VPUT: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VPUTX_VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp, ("vnode with usecount and VI_OWEINACT set")); if (error == 0) { if (vp->v_iflag & VI_OWEINACT) vinactive(vp, curthread); if (func != VPUTX_VUNREF) VOP_UNLOCK(vp, 0); } vdropl(vp); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { vputx(vp, VPUTX_VRELE); } /* * Release an already locked vnode. This gives the same effect as * unlock+vrele(), but takes less time and avoids releasing and * re-acquiring the lock (as vrele() acquires the lock internally). */ void vput(struct vnode *vp) { vputx(vp, VPUTX_VPUT); } /* * Release an exclusively locked vnode. Do not unlock the vnode lock. */ void vunref(struct vnode *vp) { vputx(vp, VPUTX_VUNREF); } /* * Increase the hold count and activate if this is the first reference.
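 *
 * Illustrative sketch only: a hold keeps the vnode from being
 * recycled without marking it in use; every vhold() must be paired
 * with a vdrop().
 */
#if 0	/* Example sketch; never compiled. */
	vhold(vp);		/* vp cannot be freed or reused from here on */
	/* ... sleepable work that must survive a concurrent vrele() ... */
	vdrop(vp);		/* may return vp to the free list */
#endif
/*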
*/ void _vhold(struct vnode *vp, bool locked) { struct mount *mp; if (locked) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!locked) { if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) { VNODE_REFCOUNT_FENCE_ACQ(); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("_vhold: vnode with holdcnt is free")); return; } VI_LOCK(vp); } if ((vp->v_iflag & VI_FREE) == 0) { refcount_acquire(&vp->v_holdcnt); if (!locked) VI_UNLOCK(vp); return; } VNASSERT(vp->v_holdcnt == 0, vp, ("%s: wrong hold count", __func__)); VNASSERT(vp->v_op != NULL, vp, ("%s: vnode already reclaimed.", __func__)); /* * Remove a vnode from the free list, mark it as in use, * and put it on the active list. */ VNASSERT(vp->v_mount != NULL, vp, ("_vhold: vnode not on per mount vnode list")); mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) { TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); mp->mnt_tmpfreevnodelistsize--; vp->v_mflag &= ~VMP_TMPMNTFREELIST; } else { mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); freevnodes--; mtx_unlock(&vnode_free_list_mtx); } KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag &= ~VI_FREE; vp->v_iflag |= VI_ACTIVE; TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&mp->mnt_listmtx); refcount_acquire(&vp->v_holdcnt); if (!locked) VI_UNLOCK(vp); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VI_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ void _vdrop(struct vnode *vp, bool locked) { struct bufobj *bo; struct mount *mp; int active; if (locked) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((int)vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); if (!locked) { if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); } if (refcount_release(&vp->v_holdcnt) == 0) { VI_UNLOCK(vp); return; } if ((vp->v_iflag & VI_DOOMED) == 0) { /* * Mark a vnode as free: remove it from its active list * and put it up for recycling on the freelist. 
*/ VNASSERT(vp->v_op != NULL, vp, ("vdropl: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(vp->v_holdcnt == 0, vp, ("vdropl: freeing when we shouldn't")); active = vp->v_iflag & VI_ACTIVE; if ((vp->v_iflag & VI_OWEINACT) == 0) { vp->v_iflag &= ~VI_ACTIVE; mp = vp->v_mount; if (mp != NULL) { mtx_lock(&mp->mnt_listmtx); if (active) { TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; } TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); mp->mnt_tmpfreevnodelistsize++; vp->v_iflag |= VI_FREE; vp->v_mflag |= VMP_TMPMNTFREELIST; VI_UNLOCK(vp); if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch) vnlru_return_batch_locked(mp); mtx_unlock(&mp->mnt_listmtx); } else { VNASSERT(active == 0, vp, ("vdropl: active vnode not on per mount " "vnode list")); mtx_lock(&vnode_free_list_mtx); TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); freevnodes++; vp->v_iflag |= VI_FREE; VI_UNLOCK(vp); mtx_unlock(&vnode_free_list_mtx); } } else { VI_UNLOCK(vp); counter_u64_add(free_owe_inact, 1); } return; } /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); atomic_subtract_long(&numvnodes, 1); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { destroy_vpollinfo(vp->v_pollinfo); vp->v_pollinfo = NULL; } #ifdef INVARIANTS /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; uma_zfree(vnode_zone, vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. 
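 *
 * Illustrative sketch only, mirroring the callers in this file: the
 * vnode must be exclusively locked and the DOINGINACT guard prevents
 * recursion.
 */
#if 0	/* Example sketch; never compiled. */
	VI_LOCK(vp);
	if ((vp->v_iflag & VI_DOINGINACT) == 0)
		vinactive(vp, curthread);
	VI_UNLOCK(vp);
#endif
/*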
*/ void vinactive(struct vnode *vp, struct thread *td) { struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. */ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush() * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing.
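 *
 * For example, a regular file that was unlinked while a process still
 * holds it open has va_nlink == 0, so the check below does not skip
 * it: its pages and buffers are written out even though it may be
 * open only for reading.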
*/ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); if (error != 0) { VOP_UNLOCK(vp, 0); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp, 0); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp, 0); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } static void notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* * Notify upper mounts about reclaimed or unlinked vnode. 
*/ void vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); if (TAILQ_EMPTY(&mp->mnt_uppers)) goto unlock; MNT_IUNLOCK(mp); mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); mmp->mnt_op = &vgonel_vfsops; mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_VGONE_UPPER; for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } free(mmp, M_TEMP); mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; wakeup(&mp->mnt_uppers); } unlock: MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; - int oweinact; - int active; struct mount *mp; + vm_object_t object; + bool active, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ - active = vp->v_usecount; - oweinact = (vp->v_iflag & VI_OWEINACT); + active = vp->v_usecount > 0; + oweinact = (vp->v_iflag & VI_OWEINACT) != 0; VI_UNLOCK(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* - * For VMIO bufobj, BO_DEAD is set in vm_object_terminate() - * after the object's page queue is flushed. + * For VMIO bufobj, BO_DEAD is set later, or in + * vm_object_terminate() after the object's page queue is + * flushed. */ - if (vp->v_bufobj.bo_object == NULL) + object = vp->v_bufobj.bo_object; + if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); + /* + * Handle the VM part. Tmpfs handles v_object on its own (the + * OBJT_VNODE check). Nullfs or other bypassing filesystems + * should not touch the object borrowed from the lower vnode + * (the handle check). 
+ */ + if (object != NULL && object->type == OBJT_VNODE && + object->handle == vp) + vnode_destroy_vobject(vp); + /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d", vp->v_usecount, vp->v_writecount, vp->v_holdcnt); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOOMED) strlcat(buf, "|VI_DOOMED", sizeof(buf)); if (vp->v_iflag & VI_FREE) strlcat(buf, "|VI_FREE", sizeof(buf)); if (vp->v_iflag & VI_ACTIVE) strlcat(buf, "|VI_ACTIVE", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE | 
VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_clean.bv_cnt, vp->v_bufobj.bo_dirty.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vn_printf(vp, "vnode "); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount <addr>\n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP);
MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_VGONE_UPPER); MNT_KERN_FLAG(MNTK_VGONE_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); MNT_KERN_FLAG(MNTK_NOKNOTE); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d\n", mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_activevnodelistsize = %d\n", mp->mnt_activevnodelistsize); db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { if (vp->v_type != VMARKER) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. 
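 *
 * Illustrative userland sketch only (not kernel code): the usual
 * consumer sizes and then fetches vfs.conflist with sysctlbyname(3).
 */
#if 0	/* Example sketch; never compiled. */
	struct xvfsconf *xvfsp;
	size_t buflen;

	if (sysctlbyname("vfs.conflist", NULL, &buflen, NULL, 0) == 0 &&
	    (xvfsp = malloc(buflen)) != NULL) {
		if (sysctlbyname("vfs.conflist", xvfsp, &buflen, NULL, 0) == 0)
			/* buflen / sizeof(*xvfsp) entries are valid */;
		free(xvfsp);
	}
#endif
/*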
*/ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. 
*/ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); vnlru_return_batch(mp); MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_WUNLOCK(obj); } vput(vp); } } else VI_UNLOCK(vp); } } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. 
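 *
 * The structure is allocated without the vnode interlock held (the
 * allocation may sleep), so two threads can race to attach one; the
 * loser notices the recheck under VI_LOCK() failing and frees its
 * copy.  The shape of this double-checked pattern, schematically:
 *
 *	new = allocate();		(may sleep, no locks held)
 *	LOCK(obj);
 *	if (obj->field != NULL) {	(somebody beat us to it)
 *		UNLOCK(obj);
 *		free(new);
 *		return;
 *	}
 *	obj->field = new;
 *	UNLOCK(obj);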
*/ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp, 0); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx.
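 *
 * The start/incr/next arithmetic above hands out worklist slots in a
 * bisecting pattern; e.g. assuming syncer_maxdelay == 32 (and ignoring
 * the final modulo by syncdelay), successive syncer vnodes land on
 * slots 16, 8, 24, 4, 12, 20, 28, 2, 6, ..., subdividing the delay
 * range ever more finely as more file systems are mounted.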
*/ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } save = curthread_pflags_set(TDP_SYNCIO); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no longer referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; int need; MPASS(mtx_owned(VI_MTX(vp))); need = 0; if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) need = 1; return (need); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */
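	/*
	 * Worked example (illustrative): for a file with mode 0644 owned
	 * by the caller and accmode = VREAD | VWRITE, the owner branch
	 * below yields dac_granted = VADMIN | VREAD | VWRITE | VAPPEND;
	 * the request is fully covered by dac_granted, so vaccess()
	 * returns 0 without consulting any privileges.
	 */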
*/ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to suppress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. 
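 * Each of these knobs is also writable at runtime; for instance,
 * "sysctl debug.vfs_badlock_mutex=0" disables the interlock checking
 * declared here.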
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (!IGNORE_LOCK(vp)) { locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_strategy_pre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (panicstr == NULL && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_pre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_post(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_pre(void *ap) { struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_unlock_post(void *ap, int rc) { struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_need_inactive_pre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_post(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } 
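/*
 * The vop_*_post() hooks above and below publish their NOTE_* hints to
 * userland through kqueue's EVFILT_VNODE filter.  A minimal watcher
 * might look like this sketch (illustrative only, not part of this
 * file; error handling trimmed):
 *
 *	#include <sys/event.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	void
 *	watch(const char *path)
 *	{
 *		struct kevent ev;
 *		int fd, kq;
 *
 *		kq = kqueue();
 *		fd = open(path, O_RDONLY);
 *		EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *		    NOTE_WRITE | NOTE_DELETE | NOTE_RENAME, 0, NULL);
 *		kevent(kq, &ev, 1, NULL, 0, NULL);
 *		for (;;) {
 *			if (kevent(kq, NULL, 0, &ev, 1, NULL) > 0)
 *				printf("fflags 0x%x\n", (unsigned)ev.fflags);
 *		}
 *	}
 */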
void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ (a->a_vp->v_iflag & VI_DOOMED) == 0)) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? 
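 *
 * The packing below places the boot-relative seconds in the upper 32
 * bits and the top half of the binary fraction in the lower 32, so
 * successive calls yield increasing values; for example,
 * bt = { .sec = 5, .frac = 1ULL << 63 } produces 0x0000000580000000.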
*/ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0); } static void vfs_knl_assert_locked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); #endif } static void vfs_knl_assert_unlocked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. 
*/ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VOP_MARKATIME. This functionality is used by execve and * mmap, so we want to avoid the I/O implied by directly setting * va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct ucred *cred) { struct mount *mp; mp = vp->v_mount; ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) (void)VOP_MARKATIME(vp); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VI_DOOMED to save a lock acquisition. 
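 * The unlocked read can race with vgone() setting the flag; that is
 * harmless, since the flag is tested again under the vnode interlock
 * below before the vnode is returned to the caller.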
*/ if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) continue; VI_LOCK(vp); if ((vp->v_iflag & VI_DOOMED) != 0) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); (*mvp)->v_mount = mp; (*mvp)->v_type = VMARKER; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) continue; VI_LOCK(vp); if ((vp->v_iflag & VI_DOOMED) != 0) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h */ static void mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_active iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { const struct vnode *tmp; bool held, ret; VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); ret = false; TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist); TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist); /* * Use a hold to prevent vp from disappearing while the mount vnode * list lock is dropped and reacquired. Normally a hold would be * acquired with vhold(), but that might try to acquire the vnode * interlock, which would be a LOR with the mount vnode list lock. */ held = refcount_acquire_if_not_zero(&vp->v_holdcnt); mtx_unlock(&mp->mnt_listmtx); if (!held) goto abort; VI_LOCK(vp); if (!refcount_release_if_not_last(&vp->v_holdcnt)) { vdropl(vp); goto abort; } mtx_lock(&mp->mnt_listmtx); /* * Determine whether the vnode is still the next one after the marker, * excepting any other markers. 
If the vnode has not been doomed by * vgone() then the hold should have ensured that it remained on the * active list. If it has been doomed but is still on the active list, * don't abort, but rather skip over it (avoid spinning on doomed * vnodes). */ tmp = mvp; do { tmp = TAILQ_NEXT(tmp, v_actfreelist); } while (tmp != NULL && tmp->v_type == VMARKER); if (tmp != vp) { mtx_unlock(&mp->mnt_listmtx); VI_UNLOCK(vp); goto abort; } ret = true; goto out; abort: maybe_yield(); mtx_lock(&mp->mnt_listmtx); out: if (ret) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); return (ret); } static struct vnode * mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp, *nvp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_actfreelist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_actfreelist); continue; } /* * Try-lock because this is the wrong lock order. If that does * not succeed, drop the mount vnode list lock and try to * reacquire it and the vnode interlock in the right order. */ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_active_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the active list %p %p", vp, mp)); if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) break; nvp = TAILQ_NEXT(vp, v_actfreelist); VI_UNLOCK(vp); vp = nvp; } TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "active iter"); KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); return (vp); } struct vnode * __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_active(mvp, mp)); } struct vnode * __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); (*mvp)->v_type = VMARKER; (*mvp)->v_mount = mp; mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_activevnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); return (mnt_vnode_next_active(mvp, mp)); } void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_active(mvp, mp); } diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h index db75d3a2f66a..9a326a92eb33 100644 --- a/sys/ufs/ufs/ufs_extern.h +++ b/sys/ufs/ufs/ufs_extern.h @@ -1,130 +1,129 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_extern.h 8.10 (Berkeley) 5/14/95 * $FreeBSD$ */ #ifndef _UFS_UFS_EXTERN_H_ #define _UFS_UFS_EXTERN_H_ struct componentname; struct direct; struct indir; struct inode; struct mount; struct thread; struct sockaddr; struct ucred; struct ufid; struct vfsconf; struct vnode; struct vop_bmap_args; struct vop_cachedlookup_args; struct vop_generic_args; struct vop_inactive_args; struct vop_reclaim_args; extern struct vop_vector ufs_fifoops; extern struct vop_vector ufs_vnodeops; int ufs_bmap(struct vop_bmap_args *); int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, struct buf *, int *, int *); int ufs_bmap_seekdata(struct vnode *, off_t *); int ufs_fhtovp(struct mount *, struct ufid *, int, struct vnode **); int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *); void ufs_dirbad(struct inode *, doff_t, char *); int ufs_dirbadentry(struct vnode *, struct direct *, int); int ufs_dirempty(struct inode *, ino_t, struct ucred *); int ufs_extread(struct vop_read_args *); int ufs_extwrite(struct vop_write_args *); void ufs_makedirentry(struct inode *, struct componentname *, struct direct *); int ufs_direnter(struct vnode *, struct vnode *, struct direct *, struct componentname *, struct buf *, int); int ufs_dirremove(struct vnode *, struct inode *, int, int); int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int); int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *, ino_t *); int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *); int ufs_inactive(struct vop_inactive_args *); int ufs_init(struct vfsconf *); void ufs_itimes(struct vnode *vp); int ufs_lookup(struct vop_cachedlookup_args *); -void ufs_prepare_reclaim(struct vnode *vp); int ufs_readdir(struct vop_readdir_args *); int ufs_reclaim(struct vop_reclaim_args *); void ffs_snapgone(struct inode *); vfs_root_t ufs_root; int ufs_uninit(struct vfsconf *); int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **); #include SYSCTL_DECL(_vfs_ufs); /* * Soft update function prototypes. 
*/ int softdep_setup_directory_add(struct buf *, struct inode *, off_t, ino_t, struct buf *, int); void softdep_change_directoryentry_offset(struct buf *, struct inode *, caddr_t, caddr_t, caddr_t, int); void softdep_setup_remove(struct buf *,struct inode *, struct inode *, int); void softdep_setup_directory_change(struct buf *, struct inode *, struct inode *, ino_t, int); void softdep_change_linkcnt(struct inode *); int softdep_slowdown(struct vnode *); void softdep_setup_create(struct inode *, struct inode *); void softdep_setup_dotdot_link(struct inode *, struct inode *); void softdep_setup_link(struct inode *, struct inode *); void softdep_setup_mkdir(struct inode *, struct inode *); void softdep_setup_rmdir(struct inode *, struct inode *); void softdep_setup_unlink(struct inode *, struct inode *); void softdep_revert_create(struct inode *, struct inode *); void softdep_revert_link(struct inode *, struct inode *); void softdep_revert_mkdir(struct inode *, struct inode *); void softdep_revert_rmdir(struct inode *, struct inode *); /* * Flags to low-level allocation routines. The low 16-bits are reserved * for IO_ flags from vnode.h. * * Note: The general vfs code typically limits the sequential heuristic * count to 127. See sequential_heuristic() in kern/vfs_vnops.c */ #define BA_CLRBUF 0x00010000 /* Clear invalid areas of buffer. */ #define BA_METAONLY 0x00020000 /* Return indirect block buffer. */ #define BA_UNMAPPED 0x00040000 /* Do not mmap resulted buffer. */ #define BA_SEQMASK 0x7F000000 /* Bits holding seq heuristic. */ #define BA_SEQSHIFT 24 #define BA_SEQMAX 0x7F #endif /* !_UFS_UFS_EXTERN_H_ */ diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c index 7ef5e6fb8ad8..8fd54b384d3b 100644 --- a/sys/ufs/ufs/ufs_inode.c +++ b/sys/ufs/ufs/ufs_inode.c @@ -1,239 +1,226 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_ufs.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #include #endif #ifdef UFS_GJOURNAL #include #endif /* * Last reference to an inode. If necessary, write or delete it. */ int ufs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); mode_t mode; int error = 0; off_t isize; struct mount *mp; mp = NULL; /* * Ignore inodes related to stale file handles. */ if (ip->i_mode == 0) goto out; #ifdef UFS_GJOURNAL ufs_gjournal_close(vp); #endif #ifdef QUOTA /* * Before moving off the active list, we must be sure that * any modified quotas have been pushed since these will no * longer be checked once the vnode is on the inactive list. */ qsyncvp(vp); #endif if ((ip->i_effnlink == 0 && DOINGSOFTDEP(vp)) || (ip->i_nlink <= 0 && !UFS_RDONLY(ip))) { loop: if (vn_start_secondary_write(vp, &mp, V_NOWAIT) != 0) { /* Cannot delete file while file system is suspended */ if ((vp->v_iflag & VI_DOOMED) != 0) { /* Cannot return before file is deleted */ (void) vn_start_secondary_write(vp, &mp, V_WAIT); } else { MNT_ILOCK(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPEND2 | MNTK_SUSPENDED)) == 0) { MNT_IUNLOCK(mp); goto loop; } /* * Fail to inactivate vnode now and * let ffs_snapshot() clean up after * it has resumed the file system. */ VI_LOCK(vp); vp->v_iflag |= VI_OWEINACT; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } } } isize = ip->i_size; if (I_IS_UFS2(ip)) isize += ip->i_din2->di_extsize; if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip)) error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, NOCRED); if (ip->i_nlink <= 0 && ip->i_mode && !UFS_RDONLY(ip)) { #ifdef QUOTA if (!getinoquota(ip)) (void)chkiq(ip, -1, NOCRED, FORCE); #endif #ifdef UFS_EXTATTR ufs_extattr_vnode_inactive(vp, ap->a_td); #endif /* * Setting the mode to zero needs to wait for the inode * to be written just as does a change to the link count. * So, rather than creating a new entry point to do the * same thing, we just use softdep_change_linkcnt(). */ DIP_SET(ip, i_rdev, 0); mode = ip->i_mode; ip->i_mode = 0; DIP_SET(ip, i_mode, 0); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); UFS_VFREE(vp, ip->i_number, mode); } if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) { if ((ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 && mp == NULL && vn_start_secondary_write(vp, &mp, V_NOWAIT)) { mp = NULL; ip->i_flag &= ~IN_ACCESS; } else { if (mp == NULL) (void) vn_start_secondary_write(vp, &mp, V_WAIT); UFS_UPDATE(vp, 0); } } out: /* * If we are done with the inode, reclaim it * so that it can be reused immediately. 
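 * (vrecycle() below triggers reclamation only if nothing else holds
 * the vnode, so an inode that is still in use is left alone.)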
*/ if (ip->i_mode == 0) vrecycle(vp); if (mp != NULL) vn_finished_secondary_write(mp); return (error); } -void -ufs_prepare_reclaim(struct vnode *vp) +/* + * Reclaim an inode so that it can be used for other purposes. + */ +int +ufs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; { - struct inode *ip; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); #ifdef QUOTA int i; -#endif - ip = VTOI(vp); - - vnode_destroy_vobject(vp); -#ifdef QUOTA for (i = 0; i < MAXQUOTAS; i++) { if (ip->i_dquot[i] != NODQUOT) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } #endif #ifdef UFS_DIRHASH if (ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif -} - -/* - * Reclaim an inode so that it can be used for other purposes. - */ -int -ufs_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct inode *ip = VTOI(vp); - - ufs_prepare_reclaim(vp); if (ip->i_flag & IN_LAZYMOD) ip->i_flag |= IN_MODIFIED; UFS_UPDATE(vp, 0); /* * Remove the inode from its hash chain. */ vfs_hash_remove(vp); /* * Lock the clearing of v_data so ffs_lock() can inspect it * prior to obtaining the lock. */ VI_LOCK(vp); vp->v_data = 0; VI_UNLOCK(vp); UFS_IFREE(ITOUMP(ip), ip); return (0); } diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 8625e1022ca4..3910e3d19528 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -1,351 +1,350 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.h 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory object module definitions. */ #ifndef _VM_OBJECT_ #define _VM_OBJECT_ #include #include #include #include #include #include #include /* * Types defined: * * vm_object_t Virtual memory object. * * List of locks * (c) const until freed * (o) per-object lock * (f) free pages queue mutex * */ #ifndef VM_PAGE_HAVE_PGLIST TAILQ_HEAD(pglist, vm_page); #define VM_PAGE_HAVE_PGLIST #endif struct vm_object { struct rwlock lock; TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ struct pglist memq; /* list of resident pages */ struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ struct domainset_ref domain; /* NUMA policy. */ int generation; /* generation ID */ int ref_count; /* How many refs?? */ int shadow_count; /* how many objects that this is a shadow for */ vm_memattr_t memattr; /* default memory attribute for pages */ objtype_t type; /* type of pager */ u_short flags; /* see below */ u_short pg_color; /* (c) color of first page in obj */ volatile u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ int resident_page_count; /* number of resident pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ void *handle; union { /* * VNode pager * * vnp_size - current size of file */ struct { off_t vnp_size; vm_ooffset_t writemappings; } vnp; /* * Device pager * * devp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) devp_pglist; struct cdev_pager_ops *ops; struct cdev *dev; } devp; /* * SG pager * * sgp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) sgp_pglist; } sgp; /* * Swap pager * * swp_tmpfs - back-pointer to the tmpfs vnode, * if any, which uses the vm object * as backing store. The handle * cannot be reused for linking, * because the vnode can be * reclaimed and recreated, making * the handle changed and hash-chain * invalid. * * swp_blks - pc-trie of the allocated swap blocks. 
* */ struct { void *swp_tmpfs; struct pctrie swp_blks; } swp; } un_pager; struct ucred *cred; vm_ooffset_t charge; void *umtx_data; }; /* * Flags */ #define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */ #define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */ #define OBJ_POPULATE 0x0004 /* pager implements populate() */ #define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ #define OBJ_NOSPLIT 0x0010 /* dont split this object */ #define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */ #define OBJ_PG_DTOR 0x0080 /* dont reset object, leave that for dtor */ #define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */ #define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */ #define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */ #define OBJ_COLORED 0x1000 /* pg_color is defined */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ -#define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */ #define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */ /* * Helpers to perform conversion between vm_object page indexes and offsets. * IDX_TO_OFF() converts an index into an offset. * OFF_TO_IDX() converts an offset into an index. * OBJ_MAX_SIZE specifies the maximum page index corresponding to the * maximum unsigned offset. */ #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT)) #define OBJ_MAX_SIZE (OFF_TO_IDX(UINT64_MAX) + 1) #ifdef _KERNEL #define OBJPC_SYNC 0x1 /* sync I/O */ #define OBJPC_INVAL 0x2 /* invalidate */ #define OBJPC_NOSYNC 0x4 /* skip if VPO_NOSYNC */ /* * The following options are supported by vm_object_page_remove(). */ #define OBJPR_CLEANONLY 0x1 /* Don't remove dirty pages. */ #define OBJPR_NOTMAPPED 0x2 /* Don't unmap pages. */ TAILQ_HEAD(object_q, vm_object); extern struct object_q vm_object_list; /* list of allocated objects */ extern struct mtx vm_object_list_mtx; /* lock for object list and count */ extern struct vm_object kernel_object_store; /* kernel and kmem are aliased for backwards KPI compat. */ #define kernel_object (&kernel_object_store) #define kmem_object (&kernel_object_store) #define VM_OBJECT_ASSERT_LOCKED(object) \ rw_assert(&(object)->lock, RA_LOCKED) #define VM_OBJECT_ASSERT_RLOCKED(object) \ rw_assert(&(object)->lock, RA_RLOCKED) #define VM_OBJECT_ASSERT_WLOCKED(object) \ rw_assert(&(object)->lock, RA_WLOCKED) #define VM_OBJECT_ASSERT_UNLOCKED(object) \ rw_assert(&(object)->lock, RA_UNLOCKED) #define VM_OBJECT_LOCK_DOWNGRADE(object) \ rw_downgrade(&(object)->lock) #define VM_OBJECT_RLOCK(object) \ rw_rlock(&(object)->lock) #define VM_OBJECT_RUNLOCK(object) \ rw_runlock(&(object)->lock) #define VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo) \ rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo)) #define VM_OBJECT_TRYRLOCK(object) \ rw_try_rlock(&(object)->lock) #define VM_OBJECT_TRYWLOCK(object) \ rw_try_wlock(&(object)->lock) #define VM_OBJECT_TRYUPGRADE(object) \ rw_try_upgrade(&(object)->lock) #define VM_OBJECT_WLOCK(object) \ rw_wlock(&(object)->lock) #define VM_OBJECT_WOWNED(object) \ rw_wowned(&(object)->lock) #define VM_OBJECT_WUNLOCK(object) \ rw_wunlock(&(object)->lock) #define VM_OBJECT_DROP(object) \ lock_class_rw.lc_unlock(&(object)->lock.lock_object) #define VM_OBJECT_PICKUP(object, state) \ lock_class_rw.lc_lock(&(object)->lock.lock_object, (state)) struct vnode; /* * The object must be locked or thread private. 
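 *
 * Typical call pattern (sketch):
 *
 *	VM_OBJECT_WLOCK(object);
 *	vm_object_set_flag(object, OBJ_NOSPLIT);
 *	VM_OBJECT_WUNLOCK(object);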
*/ static __inline void vm_object_set_flag(vm_object_t object, u_short bits) { object->flags |= bits; } /* * Conditionally set the object's color, which (1) enables the allocation * of physical memory reservations for anonymous objects and larger-than- * superpage-sized named objects and (2) determines the first page offset * within the object at which a reservation may be allocated. In other * words, the color determines the alignment of the object with respect * to the largest superpage boundary. When mapping named objects, like * files or POSIX shared memory objects, the color should be set to zero * before a virtual address is selected for the mapping. In contrast, * for anonymous objects, the color may be set after the virtual address * is selected. * * The object must be locked. */ static __inline void vm_object_color(vm_object_t object, u_short color) { if ((object->flags & OBJ_COLORED) == 0) { object->pg_color = color; object->flags |= OBJ_COLORED; } } static __inline bool vm_object_reserv(vm_object_t object) { if (object != NULL && (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) == OBJ_COLORED) { return (true); } return (false); } void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_wakeup(vm_object_t object); void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); void vm_object_pip_wait_unlocked(vm_object_t object, char *waitid); void umtx_shm_object_init(vm_object_t object); void umtx_shm_object_terminated(vm_object_t object); extern int umtx_shm_vnobj_persistent; vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t, boolean_t); void vm_object_collapse (vm_object_t); void vm_object_deallocate (vm_object_t); void vm_object_destroy (vm_object_t); void vm_object_terminate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); int vm_object_kvme_type(vm_object_t object, struct vnode **vpp); void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int); boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags); void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options); boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t); void vm_object_print(long addr, boolean_t have_addr, long count, char *modif); void vm_object_reference (vm_object_t); void vm_object_reference_locked(vm_object_t); int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr); void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t); void vm_object_split(vm_map_entry_t); boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t, boolean_t); void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue); struct vnode *vm_object_vnode(vm_object_t object); #endif /* _KERNEL */ #endif /* _VM_OBJECT_ */ diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index ed286e0024cd..a12a4b80defd 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -1,1605 +1,1580 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. 
Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, vop_getpages_iodone_t, void *); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *cred); static int vnode_pager_generic_getpages_done(struct buf *); static void vnode_pager_generic_getpages_done_async(struct buf *); struct pagerops vnodepagerops = { .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, .pgo_getpages_async = vnode_pager_getpages_async, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, }; static struct domainset *vnode_domainset = NULL; SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset, CTLTYPE_STRING | CTLFLAG_RW, &vnode_domainset, 0, sysctl_handle_domainset, "A", "Default vnode NUMA policy"); static int nvnpbufs; SYSCTL_INT(_vm, OID_AUTO, vnode_pbufs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &nvnpbufs, 0, "number of physical buffers allocated for vnode pager"); static uma_zone_t vnode_pbuf_zone; static void vnode_pager_init(void *dummy) { #ifdef __LP64__ nvnpbufs = nswbuf * 2; #else nvnpbufs = nswbuf / 2; #endif TUNABLE_INT_FETCH("vm.vnode_pbufs", &nvnpbufs); vnode_pbuf_zone = pbuf_zsecond_create("vnpbuf", nvnpbufs); } SYSINIT(vnode_pager, SI_SUB_CPU, SI_ORDER_ANY, vnode_pager_init, NULL); /* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) { vm_object_t object; vm_ooffset_t size = isize; struct vattr va; if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) return (0); - while ((object = vp->v_object) != NULL) { - VM_OBJECT_WLOCK(object); - if (!(object->flags & OBJ_DEAD)) { - VM_OBJECT_WUNLOCK(object); - return (0); - } - VOP_UNLOCK(vp, 0); - vm_object_set_flag(object, OBJ_DISCONNECTWNT); - VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vodead", 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - } + object = vp->v_object; + if (object != NULL) + return (0); if (size == 0) { if (vn_isdisk(vp, NULL)) { size = IDX_TO_OFF(INT_MAX); } else { if (VOP_GETATTR(vp, &va, td->td_ucred)) return (0); size = va.va_size; } } object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred); /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. 
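	 *
	 * For illustration only (hypothetical caller and size variable,
	 * not part of this change), a filesystem would typically create
	 * the backing object from its open path roughly as:
	 *
	 *	error = vnode_create_vobject(vp, file_size, td);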
	 */
	VM_OBJECT_WLOCK(object);
	object->ref_count--;
	VM_OBJECT_WUNLOCK(object);
	vrele(vp);

	KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));

	return (0);
}

void
vnode_destroy_vobject(struct vnode *vp)
{
	struct vm_object *obj;

	obj = vp->v_object;
-	if (obj == NULL)
+	if (obj == NULL || obj->handle != vp)
		return;
	ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
	VM_OBJECT_WLOCK(obj);
+	MPASS(obj->type == OBJT_VNODE);
	umtx_shm_object_terminated(obj);
	if (obj->ref_count == 0) {
		/*
		 * don't double-terminate the object
		 */
		if ((obj->flags & OBJ_DEAD) == 0) {
			vm_object_set_flag(obj, OBJ_DEAD);

			/*
			 * Clean pages and flush buffers.
			 */
			vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
			VM_OBJECT_WUNLOCK(obj);

			vinvalbuf(vp, V_SAVE, 0, 0);

			BO_LOCK(&vp->v_bufobj);
			vp->v_bufobj.bo_flag |= BO_DEAD;
			BO_UNLOCK(&vp->v_bufobj);

			VM_OBJECT_WLOCK(obj);
			vm_object_terminate(obj);
		} else {
			/*
			 * Waiters were already handled during object
			 * termination.  The exclusive vnode lock hopefully
			 * prevented new waiters from referencing the dying
			 * object.
			 */
-			KASSERT((obj->flags & OBJ_DISCONNECTWNT) == 0,
-			    ("OBJ_DISCONNECTWNT set obj %p flags %x",
-			    obj, obj->flags));
			vp->v_object = NULL;
			VM_OBJECT_WUNLOCK(obj);
		}
	} else {
		/*
		 * Woe to the process that tries to page now :-).
		 */
		vm_pager_deallocate(obj);
		VM_OBJECT_WUNLOCK(obj);
	}
	KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object));
}

/*
 * Allocate (or lookup) pager for a vnode.
 * Handle is a vnode pointer.
- *
- * MPSAFE
 */
vm_object_t
vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t offset, struct ucred *cred)
{
	vm_object_t object;
	struct vnode *vp;

	/*
	 * Pageout to vnode, no can do yet.
	 */
	if (handle == NULL)
		return (NULL);

-	vp = (struct vnode *) handle;
-
-	/*
-	 * If the object is being terminated, wait for it to
-	 * go away.
-	 */
-retry:
-	while ((object = vp->v_object) != NULL) {
-		VM_OBJECT_WLOCK(object);
-		if ((object->flags & OBJ_DEAD) == 0)
-			break;
-		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
-		VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vadead", 0);
-	}
-
+	vp = (struct vnode *)handle;
+	ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc");
	KASSERT(vp->v_usecount != 0, ("vnode_pager_alloc: no vnode reference"));
+retry:
+	object = vp->v_object;
	if (object == NULL) {
		/*
		 * Add an object of the appropriate size
		 */
-		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
+		object = vm_object_allocate(OBJT_VNODE,
+		    OFF_TO_IDX(round_page(size)));

		object->un_pager.vnp.vnp_size = size;
		object->un_pager.vnp.writemappings = 0;
		object->domain.dr_policy = vnode_domainset;

		object->handle = handle;
		VI_LOCK(vp);
		if (vp->v_object != NULL) {
			/*
-			 * Object has been created while we were sleeping
+			 * Object has been created while we were allocating.
			 */
			VI_UNLOCK(vp);
			VM_OBJECT_WLOCK(object);
			KASSERT(object->ref_count == 1,
			    ("leaked ref %p %d", object, object->ref_count));
			object->type = OBJT_DEAD;
			object->ref_count = 0;
			VM_OBJECT_WUNLOCK(object);
			vm_object_destroy(object);
			goto retry;
		}
		vp->v_object = object;
		VI_UNLOCK(vp);
	} else {
+		VM_OBJECT_WLOCK(object);
		object->ref_count++;
#if VM_NRESERVLEVEL > 0
		vm_object_color(object, 0);
#endif
		VM_OBJECT_WUNLOCK(object);
	}
	vrefact(vp);
	return (object);
}

/*
 *	The object must be locked.
*/ static void vnode_pager_dealloc(vm_object_t object) { struct vnode *vp; int refs; vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); VM_OBJECT_ASSERT_WLOCKED(object); vm_object_pip_wait(object, "vnpdea"); refs = object->ref_count; object->handle = NULL; object->type = OBJT_DEAD; - if (object->flags & OBJ_DISCONNECTWNT) { - vm_object_clear_flag(object, OBJ_DISCONNECTWNT); - wakeup(object); - } ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc"); if (object->un_pager.vnp.writemappings > 0) { object->un_pager.vnp.writemappings = 0; VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } vp->v_object = NULL; VI_LOCK(vp); /* * vm_map_entry_set_vnode_text() cannot reach this vnode by * following object->handle. Clear all text references now. * This also clears the transient references from * kern_execve(), which is fine because dead_vnodeops uses nop * for VOP_UNSET_TEXT(). */ if (vp->v_writecount < 0) vp->v_writecount = 0; VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(object); while (refs-- > 0) vunref(vp); VM_OBJECT_WLOCK(object); } static boolean_t vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { struct vnode *vp = object->handle; daddr_t bn; uintptr_t lockstate; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; VM_OBJECT_ASSERT_LOCKED(object); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || vp->v_iflag & VI_DOOMED) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } lockstate = VM_OBJECT_DROP(object); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VM_OBJECT_PICKUP(object, lockstate); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { /* * The BMAP vop can report a partial block in the * 'after', but must not report blocks after EOF. * Assert the latter, and truncate 'after' in case * of the former. */ KASSERT((reqblock + *after) * pagesperblock < roundup2(object->size, pagesperblock), ("%s: reqblock %jd after %d size %ju", __func__, (intmax_t )reqblock, *after, (uintmax_t )object->size)); *after *= pagesperblock; *after += pagesperblock - (poff + 1); if (pindex + *after >= object->size) *after = object->size - 1 - pindex; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. 
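 *
 * As an illustration (hypothetical FS helper, not part of this change),
 * a filesystem's truncate path would notify the VM roughly as:
 *
 *	error = xxfs_shrink_file(vp, newsize);	(FS-specific step)
 *	if (error == 0)
 *		vnode_pager_setsize(vp, (vm_ooffset_t)newsize);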
*/ void vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) { vm_object_t object; vm_page_t m; vm_pindex_t nobjsize; if ((object = vp->v_object) == NULL) return; /* ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */ VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); return; } KASSERT(object->type == OBJT_VNODE, ("not vnode-backed object %p", object)); if (nsize == object->un_pager.vnp.vnp_size) { /* * Hasn't changed size */ VM_OBJECT_WUNLOCK(object); return; } nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); if (nsize < object->un_pager.vnp.vnp_size) { /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. * * XXX for some reason (I don't know yet), if we take a * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. */ if ((nsize & PAGE_MASK) && (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL && m->valid != 0) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; /* * Clear out partial-page garbage in case * the page has been mapped. */ pmap_zero_page_area(m, base, size); /* * Update the valid bits to reflect the blocks that * have been zeroed. Some of these valid bits may * have already been set. */ vm_page_set_valid_range(m, base, size); /* * Round "base" to the next block boundary so that the * dirty bit for a partially zeroed block is not * cleared. */ base = roundup2(base, DEV_BSIZE); /* * Clear out partial-page dirty bits. * * note that we do not clear out the valid * bits. This would prevent bogus_page * replacement from working properly. 
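			 *
			 * Worked example (illustrative numbers): with
			 * PAGE_SIZE 4096 and DEV_BSIZE 512, truncating to
			 * nsize = 10000 gives base = 10000 & PAGE_MASK =
			 * 1808, so bytes [1808, 4096) of the last page are
			 * zeroed and marked valid, and base is then rounded
			 * up to 2048 so that the partially zeroed block
			 * [1536, 2048) keeps its dirty bit.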
*/ vm_page_clear_dirty(m, base, PAGE_SIZE - base); } } object->un_pager.vnp.vnp_size = nsize; object->size = nobjsize; VM_OBJECT_WUNLOCK(object); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (address < 0) return -1; if (vp->v_iflag & VI_DOOMED) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize / PAGE_SIZE; *run -= voffset / PAGE_SIZE; } } return (err); } /* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; vm_page_bits_t bits; int error, i; error = 0; vp = object->handle; if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; bits = vm_page_bits(i * bsize, bsize); if (m->valid & bits) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); if (error) break; } else bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); KASSERT((m->dirty & bits) == 0, ("vnode_pager_input_smlfs: page %p is dirty", m)); VM_OBJECT_WLOCK(object); m->valid |= bits; VM_OBJECT_WUNLOCK(object); } sf_buf_free(sf); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(vm_object_t object, vm_page_t m) { struct uio auio; struct iovec aiov; int error; int size; struct sf_buf *sf; struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); vp = object->handle; VM_OBJECT_WUNLOCK(object); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. 
*/ sf = sf_buf_alloc(m, 0); aiov.iov_base = (caddr_t)sf_buf_kva(sf); aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_td = curthread; error = VOP_READ(vp, &auio, 0, curthread->td_ucred); if (!error) { int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t)sf_buf_kva(sf) + count, PAGE_SIZE - count); } sf_buf_free(sf); VM_OBJECT_WLOCK(object); } KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m)); if (!error) m->valid = VM_PAGE_BITS_ALL; return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ /* * Local media VFS's that do not implement their own VOP_GETPAGES * should have their VOP_GETPAGES call to vnode_pager_generic_getpages() * to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_GETPAGES. */ static int vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct vnode *vp; int rtval; vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); VM_OBJECT_WLOCK(object); return rtval; } static int vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg) { struct vnode *vp; int rtval; vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages_async not implemented\n")); VM_OBJECT_WLOCK(object); return (rtval); } /* * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for * local filesystems, where partially valid pages can only occur at * the end of file. */ int vnode_pager_local_getpages(struct vop_getpages_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); } int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg)); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; off_t foff; #ifdef INVARIANTS off_t blkno0; #endif int bsize, pagesperblock; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("%s does not support devices", __func__)); if (vp->v_iflag & VI_DOOMED) return (VM_PAGER_BAD); object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size, ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); KASSERT(count <= nitems(bp->b_pages), ("%s: requested %d pages", __func__, count)); /* * The last page has valid blocks. Invalid part can only * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). 
*/ if (m[count - 1]->valid != 0 && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); } bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { uma_zfree(vnode_pbuf_zone, bp); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_old(object, m[i]); if (error) break; } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { uma_zfree(vnode_pbuf_zone, bp); return (VM_PAGER_ERROR); } /* * If the file system supports BMAP, but blocksize is smaller * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { uma_zfree(vnode_pbuf_zone, bp); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_smlfs(object, m[i]); if (error) break; } return (error); } /* * A sparse file can be encountered only for a single page request, * which may not be preceded by call to vm_pager_haspage(). */ if (bp->b_blkno == -1) { KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); uma_zfree(vnode_pbuf_zone, bp); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); VM_OBJECT_WLOCK(object); m[0]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(object); return (VM_PAGER_OK); } #ifdef INVARIANTS blkno0 = bp->b_blkno; #endif bp->b_blkno += (foff % bsize) / DEV_BSIZE; /* Recalculate blocks available after/before to pages. */ poff = (foff % bsize) / PAGE_SIZE; before *= pagesperblock; before += poff; after *= pagesperblock; after += pagesperblock - (poff + 1); if (m[0]->pindex + after >= object->size) after = object->size - 1 - m[0]->pindex; KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", __func__, count, after + 1)); after -= count - 1; /* Trim requested rbehind/rahead to possible values. */ rbehind = a_rbehind ? *a_rbehind : 0; rahead = a_rahead ? *a_rahead : 0; rbehind = min(rbehind, before); rbehind = min(rbehind, m[0]->pindex); rahead = min(rahead, after); rahead = min(rahead, object->size - m[count - 1]->pindex); /* * Check that total amount of pages fit into buf. Trim rbehind and * rahead evenly if not. */ if (rbehind + rahead + count > nitems(bp->b_pages)) { int trim, sum; trim = rbehind + rahead + count - nitems(bp->b_pages) + 1; sum = rbehind + rahead; if (rbehind == before) { /* Roundup rbehind trim to block size. */ rbehind -= roundup(trim * rbehind / sum, pagesperblock); if (rbehind < 0) rbehind = 0; } else rbehind -= trim * rbehind / sum; rahead -= trim * rahead / sum; } KASSERT(rbehind + rahead + count <= nitems(bp->b_pages), ("%s: behind %d ahead %d count %d", __func__, rbehind, rahead, count)); /* * Fill in the bp->b_pages[] array with requested and optional * read behind or read ahead pages. Read behind pages are looked * up in a backward direction, down to a first cached page. Same * for read ahead pages, but there is no need to shift the array * in case of encountering a cached page. 
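	 *
	 * Illustrative layout (example numbers, not from this change):
	 * with rbehind = 2, count = 3 and rahead = 1, b_pages[] becomes
	 *
	 *	[ b0 b1 | m[0] m[1] m[2] | a0 ]
	 *
	 * with b_pgbefore = 2, b_pgafter = 1 and b_npages = 6.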
*/ i = bp->b_npages = 0; if (rbehind) { vm_pindex_t startpindex, tpindex; vm_page_t p; VM_OBJECT_WLOCK(object); startpindex = m[0]->pindex - rbehind; if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && p->pindex >= startpindex) startpindex = p->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (tpindex = m[0]->pindex - 1; tpindex >= startpindex && tpindex < m[0]->pindex; tpindex--, i++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) { /* Shift the array. */ for (int j = 0; j < i; j++) bp->b_pages[j] = bp->b_pages[j + tpindex + 1 - startpindex]; break; } bp->b_pages[tpindex - startpindex] = p; } bp->b_pgbefore = i; bp->b_npages += i; bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; } else bp->b_pgbefore = 0; /* Requested pages. */ for (int j = 0; j < count; j++, i++) bp->b_pages[i] = m[j]; bp->b_npages += count; if (rahead) { vm_pindex_t endpindex, tpindex; vm_page_t p; if (!VM_OBJECT_WOWNED(object)) VM_OBJECT_WLOCK(object); endpindex = m[count - 1]->pindex + rahead + 1; if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && p->pindex < endpindex) endpindex = p->pindex; if (endpindex > object->size) endpindex = object->size; for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex; i++, tpindex++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[i] = p; } bp->b_pgafter = i - bp->b_npages; bp->b_npages = i; } else bp->b_pgafter = 0; if (VM_OBJECT_WOWNED(object)) VM_OBJECT_WUNLOCK(object); /* Report back actual behind/ahead read. */ if (a_rbehind) *a_rbehind = bp->b_pgbefore; if (a_rahead) *a_rahead = bp->b_pgafter; #ifdef INVARIANTS KASSERT(bp->b_npages <= nitems(bp->b_pages), ("%s: buf %p overflowed", __func__, bp)); for (int j = 1, prev = 0; j < bp->b_npages; j++) { if (bp->b_pages[j] == bogus_page) continue; KASSERT(bp->b_pages[j]->pindex - bp->b_pages[prev]->pindex == j - prev, ("%s: pages array not consecutive, bp %p", __func__, bp)); prev = j; } #endif /* * Recalculate first offset and bytecount with regards to read behind. * Truncate bytecount to vnode real size and round up physical size * for real devices. */ foff = IDX_TO_OFF(bp->b_pages[0]->pindex); bytecount = bp->b_npages << PAGE_SHIFT; if ((foff + bytecount) > object->un_pager.vnp.vnp_size) bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("%s: sector size %d too large", __func__, secmask + 1)); bytecount = (bytecount + secmask) & ~secmask; /* * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. */ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } /* Build a minimal buffer header. 
*/ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; bp->b_iooffset = dbtob(bp->b_blkno); KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) == (blkno0 - bp->b_blkno) * DEV_BSIZE + IDX_TO_OFF(m[0]->pindex) % bsize, ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju " "blkno0 %ju b_blkno %ju", bsize, (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex, (uintmax_t)blkno0, (uintmax_t)bp->b_blkno)); atomic_add_long(&runningbufspace, bp->b_runningbufspace); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, bp->b_npages); if (iodone != NULL) { /* async */ bp->b_pgiodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); return (VM_PAGER_OK); } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } } static void vnode_pager_generic_getpages_done_async(struct buf *bp) { int error; error = vnode_pager_generic_getpages_done(bp); /* Run the iodone upon the requested range. */ bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore, bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error); for (int i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); } static int vnode_pager_generic_getpages_done(struct buf *bp) { vm_object_t object; off_t tfoff, nextoff; int i, error; error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0; object = bp->b_vp->v_object; if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) { if (!buf_mapped(bp)) { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } bzero(bp->b_data + bp->b_bcount, PAGE_SIZE * bp->b_npages - bp->b_bcount); } if (buf_mapped(bp)) { pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); bp->b_data = unmapped_buf; } VM_OBJECT_WLOCK(object); for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex); i < bp->b_npages; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; mt = bp->b_pages[i]; if (nextoff <= object->un_pager.vnp.vnp_size) { /* * Read filled up entire page. */ mt->valid = VM_PAGE_BITS_ALL; KASSERT(mt->dirty == 0, ("%s: page %p is dirty", __func__, mt)); KASSERT(!pmap_page_is_mapped(mt), ("%s: page %p is mapped", __func__, mt)); } else { /* * Read did not fill up entire page. * * Currently we do not set the entire page valid, * we just try to clear the piece that we couldn't * read. */ vm_page_set_valid_range(mt, 0, object->un_pager.vnp.vnp_size - tfoff); KASSERT((mt->dirty & vm_page_bits(0, object->un_pager.vnp.vnp_size - tfoff)) == 0, ("%s: page %p is dirty", __func__, mt)); } if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(mt); } VM_OBJECT_WUNLOCK(object); if (error != 0) printf("%s: I/O read error %d\n", __func__, error); return (error); } /* * EOPNOTSUPP is no longer legal. For local media VFS's that do not * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to * vnode_pager_generic_putpages() to implement the previous behaviour. 
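 *
 * A minimal sketch of such a wrapper (hypothetical filesystem, shown
 * for illustration only):
 *
 *	static int
 *	xxfs_putpages(struct vop_putpages_args *ap)
 *	{
 *
 *		return (vnode_pager_generic_putpages(ap->a_vp, ap->a_m,
 *		    ap->a_count, ap->a_sync, ap->a_rtvals));
 *	}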
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_PUTPAGES.
 */
static void
vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
    int flags, int *rtvals)
{
	int rtval;
	struct vnode *vp;
	int bytes = count * PAGE_SIZE;

	/*
	 * Force synchronous operation if we are extremely low on memory
	 * to prevent a low-memory deadlock.  VOP operations often need to
	 * allocate more memory to initiate the I/O (i.e. do a BMAP
	 * operation).  The swapper handles the case by limiting the amount
	 * of asynchronous I/O, but that sort of solution doesn't scale well
	 * for the vnode pager without a lot of work.
	 *
	 * Also, the backing vnode's iodone routine may not wake the pageout
	 * daemon up.  This should probably be addressed.  XXX
	 */
	if (vm_page_count_min())
		flags |= VM_PAGER_PUT_SYNC;

	/*
	 * Call device-specific putpages function
	 */
	vp = object->handle;
	VM_OBJECT_WUNLOCK(object);
	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals);
	KASSERT(rtval != EOPNOTSUPP,
	    ("vnode_pager: stale FS putpages\n"));
	VM_OBJECT_WLOCK(object);
}

static int
vn_off2bidx(vm_ooffset_t offset)
{

	return ((offset & PAGE_MASK) / DEV_BSIZE);
}

static bool
vn_dirty_blk(vm_page_t m, vm_ooffset_t offset)
{

	KASSERT(IDX_TO_OFF(m->pindex) <= offset &&
	    offset < IDX_TO_OFF(m->pindex + 1),
	    ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex,
	    (uintmax_t)offset));
	return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0);
}

/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_PUTPAGES.
 *
 * This is typically called indirectly via the pageout daemon and
 * clustering has already typically occurred, so in general we ask the
 * underlying filesystem to write the data out asynchronously rather
 * than delayed.
 */
int
vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
    int flags, int *rtvals)
{
	vm_object_t object;
	vm_page_t m;
	vm_ooffset_t maxblksz, next_offset, poffset, prev_offset;
	struct uio auio;
	struct iovec aiov;
	off_t prev_resid, wrsz;
	int count, error, i, maxsize, ncount, pgoff, ppscheck;
	bool in_hole;
	static struct timeval lastfail;
	static int curfail;

	object = vp->v_object;
	count = bytecount / PAGE_SIZE;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_ERROR;

	if ((int64_t)ma[0]->pindex < 0) {
		printf("vnode_pager_generic_putpages: "
		    "attempt to write meta-data 0x%jx(%lx)\n",
		    (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty);
		rtvals[0] = VM_PAGER_BAD;
		return (VM_PAGER_BAD);
	}

	maxsize = count * PAGE_SIZE;
	ncount = count;

	poffset = IDX_TO_OFF(ma[0]->pindex);

	/*
	 * If the page-aligned write is larger than the actual file we
	 * have to invalidate pages occurring beyond the file EOF.  However,
	 * there is an edge case where a file may not be page-aligned where
	 * the last page is partially invalid.  In this case the filesystem
	 * may not properly clear the dirty bits for the entire page (which
	 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
	 * With the page locked we are free to fix-up the dirty bits here.
	 *
	 * We do not under any circumstances truncate the valid bits, as
	 * this will screw up bogus page replacement.
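	 *
	 * Worked example (illustrative numbers): for a 4-page run that
	 * starts 2 pages before an EOF lying 100 bytes into the third
	 * page, maxsize becomes 2 * PAGE_SIZE + 100, ncount becomes 3,
	 * and pgoff = 100 is rounded up to DEV_BSIZE, so the dirty bits
	 * of the third page past the EOF block are cleared and rtvals[3]
	 * is set to VM_PAGER_BAD.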
*/ VM_OBJECT_RLOCK(object); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (!VM_OBJECT_TRYUPGRADE(object)) { VM_OBJECT_RUNLOCK(object); VM_OBJECT_WLOCK(object); if (maxsize + poffset <= object->un_pager.vnp.vnp_size) goto downgrade; } if (object->un_pager.vnp.vnp_size > poffset) { maxsize = object->un_pager.vnp.vnp_size - poffset; ncount = btoc(maxsize); if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { pgoff = roundup2(pgoff, DEV_BSIZE); /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("vnode_pager_generic_putpages: page %p is not read-only", m)); MPASS(m->dirty != 0); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { maxsize = 0; ncount = 0; } for (i = ncount; i < count; i++) rtvals[i] = VM_PAGER_BAD; downgrade: VM_OBJECT_LOCK_DOWNGRADE(object); } auio.uio_iov = &aiov; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_td = NULL; maxblksz = roundup2(poffset + maxsize, DEV_BSIZE); for (prev_offset = poffset; prev_offset < maxblksz;) { /* Skip clean blocks. */ for (in_hole = true; in_hole && prev_offset < maxblksz;) { m = ma[OFF_TO_IDX(prev_offset - poffset)]; for (i = vn_off2bidx(prev_offset); i < sizeof(vm_page_bits_t) * NBBY && prev_offset < maxblksz; i++) { if (vn_dirty_blk(m, prev_offset)) { in_hole = false; break; } prev_offset += DEV_BSIZE; } } if (in_hole) goto write_done; /* Find longest run of dirty blocks. */ for (next_offset = prev_offset; next_offset < maxblksz;) { m = ma[OFF_TO_IDX(next_offset - poffset)]; for (i = vn_off2bidx(next_offset); i < sizeof(vm_page_bits_t) * NBBY && next_offset < maxblksz; i++) { if (!vn_dirty_blk(m, next_offset)) goto start_write; next_offset += DEV_BSIZE; } } start_write: if (next_offset > poffset + maxsize) next_offset = poffset + maxsize; /* * Getting here requires finding a dirty block in the * 'skip clean blocks' loop. */ MPASS(prev_offset < next_offset); VM_OBJECT_RUNLOCK(object); aiov.iov_base = NULL; auio.uio_iovcnt = 1; auio.uio_offset = prev_offset; prev_resid = auio.uio_resid = aiov.iov_len = next_offset - prev_offset; error = VOP_WRITE(vp, &auio, vnode_pager_putpages_ioflags(flags), curthread->td_ucred); wrsz = prev_resid - auio.uio_resid; if (wrsz == 0) { if (ppsratecheck(&lastfail, &curfail, 1) != 0) { vn_printf(vp, "vnode_pager_putpages: " "zero-length write at %ju resid %zd\n", auio.uio_offset, auio.uio_resid); } VM_OBJECT_RLOCK(object); break; } /* Adjust the starting offset for next iteration. */ prev_offset += wrsz; MPASS(auio.uio_offset == prev_offset); ppscheck = 0; if (error != 0 && (ppscheck = ppsratecheck(&lastfail, &curfail, 1)) != 0) vn_printf(vp, "vnode_pager_putpages: I/O error %d\n", error); if (auio.uio_resid != 0 && (ppscheck != 0 || ppsratecheck(&lastfail, &curfail, 1) != 0)) vn_printf(vp, "vnode_pager_putpages: residual I/O %zd " "at %ju\n", auio.uio_resid, (uintmax_t)ma[0]->pindex); VM_OBJECT_RLOCK(object); if (error != 0 || auio.uio_resid != 0) break; } write_done: /* Mark completely processed pages. */ for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++) rtvals[i] = VM_PAGER_OK; /* Mark partial EOF page. */ if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0) rtvals[i++] = VM_PAGER_OK; /* Unwritten pages in range, free bonus if the page is clean. */ for (; i < ncount; i++) rtvals[i] = ma[i]->dirty == 0 ? 
		    VM_PAGER_OK : VM_PAGER_ERROR;

	VM_OBJECT_RUNLOCK(object);
	VM_CNT_ADD(v_vnodepgsout, i);
	VM_CNT_INC(v_vnodeout);
	return (rtvals[0]);
}

int
vnode_pager_putpages_ioflags(int pager_flags)
{
	int ioflags;

	/*
	 * Pageouts are already clustered, use IO_ASYNC to force a
	 * bawrite() rather than a bdwrite() to prevent paging I/O
	 * from saturating the buffer cache.  Dummy-up the sequential
	 * heuristic to cause large ranges to cluster.  If neither
	 * IO_SYNC or IO_ASYNC is set, the system decides how to
	 * cluster.
	 */
	ioflags = IO_VMIO;
	if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0)
		ioflags |= IO_SYNC;
	else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0)
		ioflags |= IO_ASYNC;
	ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0;
	ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0;
	ioflags |= IO_SEQMAX << IO_SEQSHIFT;
	return (ioflags);
}

/*
 * vnode_pager_undirty_pages().
 *
 * A helper to mark pages as clean after pageout that was possibly
 * done with a short write.  The lpos argument specifies the page run
 * length in bytes, and the written argument specifies how many bytes
 * were actually written.  eof is the offset past the last valid byte
 * in the vnode using the absolute file position of the first byte in
 * the run as the base from which it is computed.
 */
void
vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof,
    int lpos)
{
	vm_object_t obj;
	int i, pos, pos_devb;

	if (written == 0 && eof >= lpos)
		return;
	obj = ma[0]->object;
	VM_OBJECT_WLOCK(obj);
	for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
		if (pos < trunc_page(written)) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(ma[i]);
		} else {
			/* Partially written page. */
			rtvals[i] = VM_PAGER_AGAIN;
			vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
		}
	}
	if (eof >= lpos) /* avoid truncation */
		goto done;
	for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) {
		if (pos != trunc_page(pos)) {
			/*
			 * The page contains the last valid byte in
			 * the vnode, mark the rest of the page as
			 * clean, potentially making the whole page
			 * clean.
			 */
			pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE);
			vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE -
			    pos_devb);

			/*
			 * If the page was cleaned, report the pageout
			 * on it as successful.  msync() no longer
			 * needs to write out the page, endlessly
			 * creating write requests and dirty buffers.
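			 *
			 * Illustrative numbers (assuming DEV_BSIZE 512):
			 * if eof falls 100 bytes into page i, dirty bits
			 * from roundup2(100, 512) = 512 through the end
			 * of the page are cleared, and if that leaves
			 * ma[i] fully clean, rtvals[i] is set to
			 * VM_PAGER_OK.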
			 */
			if (ma[i]->dirty == 0)
				rtvals[i] = VM_PAGER_OK;

			pos = round_page(pos);
		} else {
			/* vm_pageout_flush() clears dirty */
			rtvals[i] = VM_PAGER_BAD;
			pos += PAGE_SIZE;
		}
	}
done:
	VM_OBJECT_WUNLOCK(obj);
}

void
vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
    vm_offset_t end)
{
	struct vnode *vp;
	vm_ooffset_t old_wm;

	VM_OBJECT_WLOCK(object);
	if (object->type != OBJT_VNODE) {
		VM_OBJECT_WUNLOCK(object);
		return;
	}
	old_wm = object->un_pager.vnp.writemappings;
	object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start;
	vp = object->handle;
	if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) {
		ASSERT_VOP_LOCKED(vp, "v_writecount inc");
		VOP_ADD_WRITECOUNT_CHECKED(vp, 1);
		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
		    __func__, vp, vp->v_writecount);
	} else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) {
		ASSERT_VOP_LOCKED(vp, "v_writecount dec");
		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
		    __func__, vp, vp->v_writecount);
	}
	VM_OBJECT_WUNLOCK(object);
}

void
vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
    vm_offset_t end)
{
	struct vnode *vp;
	struct mount *mp;
	vm_offset_t inc;

	VM_OBJECT_WLOCK(object);

	/*
	 * First, recheck the object type to account for the race when
	 * the vnode is reclaimed.
	 */
	if (object->type != OBJT_VNODE) {
		VM_OBJECT_WUNLOCK(object);
		return;
	}

	/*
	 * Optimize for the case when writemappings is not going to
	 * zero.
	 */
	inc = end - start;
	if (object->un_pager.vnp.writemappings != inc) {
		object->un_pager.vnp.writemappings -= inc;
		VM_OBJECT_WUNLOCK(object);
		return;
	}

	vp = object->handle;
	vhold(vp);
	VM_OBJECT_WUNLOCK(object);
	mp = NULL;
	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_SHARED | LK_RETRY);

	/*
	 * Decrement the object's writemappings, by swapping the start
	 * and end arguments for vnode_pager_update_writecount().  If
	 * there was not a race with vnode reclamation, then the
	 * vnode's v_writecount is decremented.
	 */
	vnode_pager_update_writecount(object, end, start);
	VOP_UNLOCK(vp, 0);
	vdrop(vp);
	if (mp != NULL)
		vn_finished_write(mp);
}
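
/*
 * Illustrative note (not part of the original change): callers pair the
 * two functions above around the lifetime of a writeable mapping.  The
 * release side reuses the update path with the start and end arguments
 * reversed, so writemappings shrinks by (end - start); this relies on
 * vm_ooffset_t being a signed type.
 */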