Index: sys/fs/nullfs/null_vfsops.c
===================================================================
--- sys/fs/nullfs/null_vfsops.c
+++ sys/fs/nullfs/null_vfsops.c
@@ -195,7 +195,8 @@
 	}
 	mp->mnt_kern_flag |= MNTK_LOOKUP_EXCL_DOTDOT | MNTK_NOMSYNC;
 	mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
-	    (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS);
+	    (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS |
+	    MNTK_PGCACHE_READ);
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
 	if ((xmp->nullm_flags & NULLM_CACHE) != 0) {
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -132,6 +132,10 @@
 static int vn_io_fault_prefault = 0;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW,
     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
+static int vn_io_pgcache_read_enable = 1;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RW,
+    &vn_io_pgcache_read_enable, 0,
+    "Enable copying from page cache for reads, avoiding fs");
 static u_long vn_io_faults_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
@@ -844,6 +848,116 @@
 	return (ret);
 }
 
+/*
+ * Try to satisfy a read directly from the pages of the vnode's VM
+ * object.  Returns 0 if the whole request was copied, EJUSTRETURN if
+ * the caller must use the regular read path for the remaining data,
+ * or the error reported by the copy.
+ */
+static int
+vn_read_from_obj(struct vnode *vp, struct uio *uio)
+{
+	vm_object_t obj;
+	vm_page_t ma[io_hold_cnt + 2];
+	off_t off, vsz;
+	ssize_t resid;
+	int error, i, j;
+
+	MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
+
+	/*
+	 * XXXKIB Does not enable page cache read for tmpfs.  There is
+	 * no way to get tmpfs vnode size without vnode lock, needed
+	 * for clipping read on the last page.
+	 */
+	obj = vp->v_object;
+	if (obj == NULL || obj->type != OBJT_VNODE)
+		return (EJUSTRETURN);
+
+	resid = uio->uio_resid;
+	off = uio->uio_offset;
+	for (i = 0; resid > 0; i++) {
+		MPASS(i < io_hold_cnt + 2);
+		ma[i] = vm_page_grab_unlocked(obj, atop(off),
+		    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
+		    (i == 0 ? 0 : VM_ALLOC_NOWAIT));
+		if (ma[i] == NULL)
+			break;
+
+		/*
+		 * Stop at the first invalid page: ma[] must describe a
+		 * contiguous range starting at uio_offset.  Valid mask
+		 * can be partial only at EOF, and we clip later.
+		 */
+		if (ma[i]->valid == 0) {
+			vm_page_sunbusy(ma[i]);
+			break;
+		}
+
+		resid -= PAGE_SIZE;
+		off += PAGE_SIZE;
+	}
+	if (i == 0)
+		return (EJUSTRETURN);
+
+	/*
+	 * Check VIRF_DOOMED after we busied our pages.  Since
+	 * vgonel() terminates the vnode's vm_object, it cannot
+	 * process past pages busied by us.
+	 */
+	if (VN_IS_DOOMED(vp)) {
+		error = EJUSTRETURN;
+		goto out;
+	}
+
+	/* Bytes covered by the i busied pages, starting at uio_offset. */
+	resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
+	if (resid > uio->uio_resid)
+		resid = uio->uio_resid;
+
+	/*
+	 * Unlocked read of vnp_size is safe because truncation cannot
+	 * pass busied page.  But we load vnp_size into a local
+	 * variable so that possible concurrent extension does not
+	 * break calculation.
+	 */
+	vsz = obj->un_pager.vnp.vnp_size;
+	if (uio->uio_offset >= vsz) {
+		/* At or past EOF: never pass a length <= 0 to the copy. */
+		error = EJUSTRETURN;
+		goto out;
+	}
+	if (uio->uio_offset + resid > vsz)
+		resid = vsz - uio->uio_offset;
+
+	error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
+
+out:
+	for (j = 0; j < i; j++)
+		vm_page_sunbusy(ma[j]);
+	if (error != 0)
+		return (error);
+	return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
+}
+
+/*
+ * Decide whether vn_read() may try the page cache before the filesystem:
+ * VREG vnode on a mount with MNTK_PGCACHE_READ set,
+ * mac_vnode_check_read_enabled() false, a non-negative offset, at most
+ * ptoa(io_hold_cnt) bytes requested, no O_DIRECT, and the
+ * debug.vn_io_pgcache_read_enable sysctl left non-zero.
+ */
+static bool
+do_vn_read_from_pgcache(struct vnode *vp, struct uio *uio, struct file *fp)
+{
+	struct mount *mp;
+
+	return (vp->v_type == VREG && (mp = vp->v_mount) != NULL &&
+	    (mp->mnt_kern_flag & MNTK_PGCACHE_READ) != 0 &&
+	    !mac_vnode_check_read_enabled() &&
+	    uio->uio_resid <= ptoa(io_hold_cnt) && uio->uio_offset >= 0 &&
+	    (fp->f_flag & O_DIRECT) == 0 && vn_io_pgcache_read_enable);
+}
+
 /*
  * File table vnode read routine.
  */
@@ -860,6 +974,15 @@
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
+	if (do_vn_read_from_pgcache(vp, uio, fp)) {
+		error = vn_read_from_obj(vp, uio);
+		if (error == 0) {
+			fp->f_nextoff[UIO_READ] = uio->uio_offset;
+			return (0);
+		}
+		if (error != EJUSTRETURN)
+			return (error);
+	}
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
@@ -1164,8 +1287,8 @@
 			uio_clone->uio_iovcnt--;
 			continue;
 		}
-		if (len > io_hold_cnt * PAGE_SIZE)
-			len = io_hold_cnt * PAGE_SIZE;
+		if (len > ptoa(io_hold_cnt))
+			len = ptoa(io_hold_cnt);
 		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
 		end = round_page(addr + len);
 		if (end < addr) {
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h
+++ sys/sys/mount.h
@@ -421,6 +421,7 @@
 #define	MNTK_VMSETSIZE_BUG	0x00010000
 #define	MNTK_UNIONFS	0x00020000	/* A hack for F_ISUNIONSTACK */
 #define	MNTK_FPLOOKUP	0x00040000	/* fast path lookup is supported */
+#define	MNTK_PGCACHE_READ	0x00080000	/* vn_read tries pgcache first */
 #define	MNTK_NOASYNC	0x00800000	/* disable async */
 #define	MNTK_UNMOUNT	0x01000000	/* unmount in progress */
 #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
Index: sys/ufs/ffs/ffs_vfsops.c
===================================================================
--- sys/ufs/ffs/ffs_vfsops.c
+++ sys/ufs/ffs/ffs_vfsops.c
@@ -1296,7 +1296,8 @@
 	 */
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
-	    MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE;
+	    MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE |
+	    MNTK_PGCACHE_READ;
 	MNT_IUNLOCK(mp);
 #ifdef UFS_EXTATTR
 #ifdef UFS_EXTATTR_AUTOSTART