Index: sys/fs/nullfs/null_subr.c
===================================================================
--- sys/fs/nullfs/null_subr.c
+++ sys/fs/nullfs/null_subr.c
@@ -258,6 +258,26 @@
 	if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp)
 		vp->v_vflag |= VV_ROOT;
 
+	/*
+	 * We might miss the case where lower vnode sets VIRF_PGREAD
+	 * some time after construction, which is the typical case.
+	 * null_open rechecks.
+	 */
+	if ((lowervp->v_irflag & VIRF_PGREAD) != 0) {
+		MPASS(lowervp->v_object != NULL);
+		if ((vp->v_irflag & VIRF_PGREAD) == 0) {
+			if (vp->v_object == NULL)
+				vp->v_object = lowervp->v_object;
+			else
+				MPASS(vp->v_object == lowervp->v_object);
+			VI_LOCK(vp);
+			vp->v_irflag |= VIRF_PGREAD;
+			VI_UNLOCK(vp);
+		} else {
+			MPASS(vp->v_object != NULL);
+		}
+	}
+
 	/*
 	 * Atomically insert our new node into the hash or vget existing
 	 * if someone else has beaten us to it.
Index: sys/fs/nullfs/null_vnops.c
===================================================================
--- sys/fs/nullfs/null_vnops.c
+++ sys/fs/nullfs/null_vnops.c
@@ -439,8 +439,17 @@
 	vp = ap->a_vp;
 	ldvp = NULLVPTOLOWERVP(vp);
 	retval = null_bypass(&ap->a_gen);
-	if (retval == 0)
+	if (retval == 0) {
 		vp->v_object = ldvp->v_object;
+		if ((ldvp->v_irflag & VIRF_PGREAD) != 0) {
+			MPASS(vp->v_object != NULL);
+			if ((vp->v_irflag & VIRF_PGREAD) == 0) {
+				VI_LOCK(vp);
+				vp->v_irflag |= VIRF_PGREAD;
+				VI_UNLOCK(vp);
+			}
+		}
+	}
 	return (retval);
 }
 
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -4211,7 +4211,9 @@
 	buf[1] = '\0';
 	if (vp->v_irflag & VIRF_DOOMED)
 		strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
-	flags = vp->v_irflag & ~(VIRF_DOOMED);
+	if (vp->v_irflag & VIRF_PGREAD)
+		strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
+	flags = vp->v_irflag & ~(VIRF_DOOMED | VIRF_PGREAD);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -127,11 +127,15 @@
 
 static const int io_hold_cnt = 16;
 static int vn_io_fault_enable = 1;
-SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
 static int vn_io_fault_prefault = 0;
-SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW,
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
+static int vn_io_pgcache_read_enable = 1;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
+    &vn_io_pgcache_read_enable, 0,
+    "Enable copying from page cache for reads, avoiding fs");
 static u_long vn_io_faults_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
@@ -844,6 +848,134 @@
 	return (ret);
 }
 
+/*
+ * Try to satisfy a read directly from the pages already resident in
+ * the vnode's VM object, without calling into the filesystem.
+ *
+ * Returns 0 if the whole request was copied out, EJUSTRETURN if the
+ * caller must fall back to (or continue with) the regular read path,
+ * or the error reported by vn_io_fault_pgmove().
+ */
+static int
+vn_read_from_obj(struct vnode *vp, struct uio *uio)
+{
+	vm_object_t obj;
+	vm_page_t ma[io_hold_cnt + 2];
+	off_t off, vsz;
+	ssize_t resid;
+	int error, i, j;
+
+	obj = vp->v_object;
+	MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
+	MPASS(obj != NULL);
+	MPASS(obj->type == OBJT_VNODE);
+
+	/*
+	 * Depends on type stability of vm_objects.
+	 */
+	vm_object_pip_add(obj, 1);
+	if ((obj->flags & OBJ_DEAD) != 0) {
+		/*
+		 * Note that object might be already reused from the
+		 * vnode, and the OBJ_DEAD flag cleared. This is fine,
+		 * we recheck for DOOMED vnode state after all pages
+		 * are busied, and retract then.
+		 *
+		 * But we check for OBJ_DEAD to ensure that we do not
+		 * busy pages while vm_object_terminate_pages()
+		 * processes the queue.
+		 */
+		error = EJUSTRETURN;
+		goto out_pip;
+	}
+
+	resid = uio->uio_resid;
+	off = uio->uio_offset;
+	for (i = 0; resid > 0; i++) {
+		MPASS(i < io_hold_cnt + 2);
+		ma[i] = vm_page_grab_unlocked(obj, atop(off),
+		    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
+		    VM_ALLOC_NOWAIT);
+		if (ma[i] == NULL)
+			break;
+
+		/*
+		 * Skip invalid pages. Valid mask can be partial only
+		 * at EOF, and we clip later.
+		 */
+		if (vm_page_none_valid(ma[i])) {
+			vm_page_sunbusy(ma[i]);
+			break;
+		}
+
+		resid -= PAGE_SIZE;
+		off += PAGE_SIZE;
+	}
+	if (i == 0) {
+		error = EJUSTRETURN;
+		goto out_pip;
+	}
+
+	/*
+	 * Check VIRF_DOOMED after we busied our pages. Since
+	 * vgonel() terminates the vnode's vm_object, it cannot
+	 * process past pages busied by us.
+	 */
+	if (VN_IS_DOOMED(vp)) {
+		error = EJUSTRETURN;
+		goto out;
+	}
+
+	resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
+	if (resid > uio->uio_resid)
+		resid = uio->uio_resid;
+
+	/*
+	 * Unlocked read of vnp_size is safe because truncation cannot
+	 * pass busied page. But we load vnp_size into a local
+	 * variable so that possible concurrent extension does not
+	 * break calculation.
+	 */
+#if defined(__powerpc__) && !defined(__powerpc64__)
+	vsz = obj->un_pager.vnp.vnp_size;
+#else
+	vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
+#endif
+	if (uio->uio_offset + resid > vsz)
+		resid = vsz - uio->uio_offset;
+
+	error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
+
+out:
+	for (j = 0; j < i; j++) {
+		if (error == 0)
+			vm_page_reference(ma[j]);
+		vm_page_sunbusy(ma[j]);
+	}
+out_pip:
+	vm_object_pip_wakeup(obj);
+	if (error != 0)
+		return (error);
+	return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
+}
+
+/*
+ * Decide whether vn_read() may attempt the page cache read shortcut
+ * for this request: the vnode must be marked VIRF_PGREAD, no MAC
+ * vnode read check may be enabled, the request must be no larger
+ * than io_hold_cnt pages and start at a non-negative offset, the
+ * file must not be opened with O_DIRECT, and the shortcut must be
+ * enabled through the vn_io_pgcache_read_enable sysctl.
+ */
+static bool
+do_vn_read_from_pgcache(struct vnode *vp, struct uio *uio, struct file *fp)
+{
+	return ((vp->v_irflag & VIRF_PGREAD) != 0 &&
+	    !mac_vnode_check_read_enabled() &&
+	    uio->uio_resid <= ptoa(io_hold_cnt) && uio->uio_offset >= 0 &&
+	    (fp->f_flag & O_DIRECT) == 0 && vn_io_pgcache_read_enable);
+}
+
 /*
  * File table vnode read routine.
  */
@@ -860,6 +992,15 @@
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
+	if (do_vn_read_from_pgcache(vp, uio, fp)) {
+		error = vn_read_from_obj(vp, uio);
+		if (error == 0) {
+			fp->f_nextoff[UIO_READ] = uio->uio_offset;
+			return (0);
+		}
+		if (error != EJUSTRETURN)
+			return (error);
+	}
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
@@ -1164,8 +1305,8 @@
 			uio_clone->uio_iovcnt--;
 			continue;
 		}
-		if (len > io_hold_cnt * PAGE_SIZE)
-			len = io_hold_cnt * PAGE_SIZE;
+		if (len > ptoa(io_hold_cnt))
+			len = ptoa(io_hold_cnt);
 		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
 		end = round_page(addr + len);
 		if (end < addr) {
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -244,6 +244,8 @@
 #define	VHOLD_ALL_FLAGS	(VHOLD_NO_SMR)
 
 #define	VIRF_DOOMED	0x0001	/* This vnode is being recycled */
+#define	VIRF_PGREAD	0x0002	/* Direct reads from the page cache are permitted,
+				   never cleared once set */
 
 #define	VI_TEXT_REF	0x0001	/* Text ref grabbed use ref */
 #define	VI_MOUNT	0x0002	/* Mount in progress */
Index: sys/ufs/ufs/ufs_vnops.c
===================================================================
--- sys/ufs/ufs/ufs_vnops.c
+++ sys/ufs/ufs/ufs_vnops.c
@@ -282,13 +282,20 @@
 		return (EOPNOTSUPP);
 
 	ip = VTOI(vp);
+	vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
+	if (vp->v_type == VREG && (vp->v_irflag & VIRF_PGREAD) == 0) {
+		VI_LOCK(vp);
+		vp->v_irflag |= VIRF_PGREAD;
+		VI_UNLOCK(vp);
+	}
+
 	/*
 	 * Files marked append-only must be opened for appending.
 	 */
 	if ((ip->i_flags & APPEND) &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
-	vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
+
 	return (0);
 }
 
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -192,9 +192,11 @@
 	    ("object %p has reservations", object));
 #endif
 
+#if 0
 	KASSERT(blockcount_read(&object->paging_in_progress) == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, blockcount_read(&object->paging_in_progress)));
+#endif
 	KASSERT(!vm_object_busied(object),
 	    ("object %p busy = %d", object, blockcount_read(&object->busy)));
 	KASSERT(object->resident_page_count == 0,
@@ -294,6 +296,9 @@
 	 * The lock portion of struct vm_object must be type stable due
 	 * to vm_pageout_fallback_object_lock locking a vm object
 	 * without holding any references to it.
+	 *
+	 * paging_in_progress is valid always. Lockless references to
+	 * the objects may acquire pip and then check OBJ_DEAD.
 	 */
 	obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
 #ifdef INVARIANTS
@@ -936,17 +941,14 @@
 	    ("terminating shadow obj %p", object));
 
 	/*
-	 * wait for the pageout daemon to be done with the object
+	 * Wait for the pageout daemon and other current users to be
+	 * done with the object. Note that new paging_in_progress
+	 * users can come after this wait, but they must check
+	 * OBJ_DEAD flag set (without unlocking the object), and avoid
+	 * the object being terminated.
 	 */
 	vm_object_pip_wait(object, "objtrm");
 
-	KASSERT(!blockcount_read(&object->paging_in_progress),
-	    ("vm_object_terminate: pageout in progress"));
-
-	KASSERT(object->ref_count == 0,
-	    ("vm_object_terminate: object with references, ref_count=%d",
-	    object->ref_count));
-
 	if ((object->flags & OBJ_PG_DTOR) == 0)
 		vm_object_terminate_pages(object);
 
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -520,7 +520,11 @@
 		vm_page_xunbusy(m);
 	}
 out:
+#if defined(__powerpc__) && !defined(__powerpc64__)
 	object->un_pager.vnp.vnp_size = nsize;
+#else
+	atomic_store_64(&object->un_pager.vnp.vnp_size, nsize);
+#endif
 	object->size = nobjsize;
 	VM_OBJECT_WUNLOCK(object);
 }