Index: sys/kern/vfs_bio.c
===================================================================
--- sys/kern/vfs_bio.c
+++ sys/kern/vfs_bio.c
@@ -1786,6 +1786,8 @@
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+	KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
+	    ("brelse: non-VMIO buffer marked NOREUSE"));
 
 	if (BUF_LOCKRECURSED(bp)) {
 		/*
@@ -2056,6 +2058,8 @@
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
+	if ((bp->b_flags & B_NOREUSE) != 0)
+		bp->b_flags |= B_RELBUF;
 }
 
 /*
@@ -2143,10 +2147,15 @@
 			freed = false;
 		if (!freed) {
 			/*
-			 * In order to maintain LRU page ordering, put
-			 * the page at the tail of the inactive queue.
+			 * If the page is unlikely to be reused, let the
+			 * VM know.  Otherwise, maintain LRU page
+			 * ordering and put the page at the tail of the
+			 * inactive queue.
 			 */
-			vm_page_deactivate(m);
+			if ((bp->b_flags & B_NOREUSE) != 0)
+				vm_page_deactivate_noreuse(m);
+			else
+				vm_page_deactivate(m);
 		}
 	}
 	vm_page_unlock(m);
@@ -2157,7 +2166,7 @@
 	if (bp->b_bufsize)
 		bufspaceadjust(bp, 0);
 	bp->b_npages = 0;
-	bp->b_flags &= ~B_VMIO;
+	bp->b_flags &= ~(B_VMIO | B_NOREUSE);
 }
 
 /*
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -1034,9 +1034,12 @@
 int
 vop_stdadvise(struct vop_advise_args *ap)
 {
+	struct buf *bp;
+	struct buflists *bl;
 	struct vnode *vp;
+	daddr_t bn, startn, endn;
 	off_t start, end;
-	int error;
+	int bsize, error;
 
 	vp = ap->a_vp;
 	switch (ap->a_advice) {
@@ -1049,28 +1052,60 @@
 		error = 0;
 		break;
 	case POSIX_FADV_DONTNEED:
-		/*
-		 * Flush any open FS buffers and then remove pages
-		 * from the backing VM object.  Using vinvalbuf() here
-		 * is a bit heavy-handed as it flushes all buffers for
-		 * the given vnode, not just the buffers covering the
-		 * requested range.
-		 */
 		error = 0;
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (vp->v_iflag & VI_DOOMED) {
 			VOP_UNLOCK(vp, 0);
 			break;
 		}
-		vinvalbuf(vp, V_CLEANONLY, 0, 0);
+
+		/*
+		 * Deactivate pages in the specified range from the backing VM
+		 * object.  Pages that are resident in the buffer cache will
+		 * remain wired until their corresponding buffers are released
+		 * below.
+		 */
 		if (vp->v_object != NULL) {
 			start = trunc_page(ap->a_start);
 			end = round_page(ap->a_end);
 			VM_OBJECT_WLOCK(vp->v_object);
-			vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
+			vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start),
 			    OFF_TO_IDX(end));
 			VM_OBJECT_WUNLOCK(vp->v_object);
 		}
+
+		BO_RLOCK(&vp->v_bufobj);
+		bsize = vp->v_bufobj.bo_bsize;
+		startn = ap->a_start / bsize;
+		if (ap->a_end == OFF_MAX) {
+			endn = -1;
+			bl = &vp->v_bufobj.bo_clean.bv_hd;
+			if (!TAILQ_EMPTY(bl))
+				endn = TAILQ_LAST(bl, buflists)->b_lblkno;
+			bl = &vp->v_bufobj.bo_dirty.bv_hd;
+			if (!TAILQ_EMPTY(bl) &&
+			    endn < TAILQ_LAST(bl, buflists)->b_lblkno)
+				endn = TAILQ_LAST(bl, buflists)->b_lblkno;
+		} else
+			endn = ap->a_end / bsize;
+		BO_RUNLOCK(&vp->v_bufobj);
+
+		/*
+		 * In the VMIO case, use the B_NOREUSE flag to hint that the
+		 * pages backing each buffer in the range are unlikely to be
+		 * reused.  Dirty buffers will have the hint applied once
+		 * they've been written.
+		 */
+		for (bn = startn; bn <= endn; bn++) {
+			bp = getblk(vp, bn, bsize, 0, 0, GB_NOCREAT |
+			    GB_UNMAPPED);
+			if (bp == NULL)
+				continue;
+			if (vp->v_object != NULL)
+				bp->b_flags |= B_NOREUSE;
+			if ((bp->b_flags & B_DELWRI) == 0)
+				bp->b_flags |= B_RELBUF;
+			brelse(bp);
+		}
 		VOP_UNLOCK(vp, 0);
 		break;
 	default:
Index: sys/kern/vfs_syscalls.c
===================================================================
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -4610,8 +4610,6 @@
 		new->fa_advice = advice;
 		new->fa_start = offset;
 		new->fa_end = end;
-		new->fa_prevstart = 0;
-		new->fa_prevend = 0;
 		fp->f_advice = new;
 		new = fa;
 	}
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -762,18 +762,12 @@
  * File table vnode read routine.
  */
 static int
-vn_read(fp, uio, active_cred, flags, td)
-	struct file *fp;
-	struct uio *uio;
-	struct ucred *active_cred;
-	int flags;
-	struct thread *td;
+vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+    struct thread *td)
 {
 	struct vnode *vp;
-	struct mtx *mtxp;
-	int error, ioflag;
-	int advice;
-	off_t offset, start, end;
+	off_t orig_offset;
+	int advice, error, ioflag;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -797,7 +791,7 @@
 		/* Disable read-ahead for random I/O. */
 		break;
 	}
-	offset = uio->uio_offset;
+	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@@ -807,39 +801,14 @@
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
-	    offset != uio->uio_offset) {
+	    orig_offset != uio->uio_offset)
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush clean pages and
 		 * buffers for the backing file after a
-		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
-		 * case of using POSIX_FADV_NOREUSE with sequential
-		 * access, track the previous implicit DONTNEED
-		 * request and grow this request to include the
-		 * current read(2) in addition to the previous
-		 * DONTNEED.  With purely sequential access this will
-		 * cause the DONTNEED requests to continously grow to
-		 * cover all of the previously read regions of the
-		 * file.  This allows filesystem blocks that are
-		 * accessed by multiple calls to read(2) to be flushed
-		 * once the last read(2) finishes.
+		 * POSIX_FADV_NOREUSE read(2).
 		 */
-		start = offset;
-		end = uio->uio_offset - 1;
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if (fp->f_advice != NULL &&
-		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
-			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
-				start = fp->f_advice->fa_prevstart;
-			else if (fp->f_advice->fa_prevstart != 0 &&
-			    fp->f_advice->fa_prevstart == end + 1)
-				end = fp->f_advice->fa_prevend;
-			fp->f_advice->fa_prevstart = start;
-			fp->f_advice->fa_prevend = end;
-		}
-		mtx_unlock(mtxp);
-		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
-	}
+		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
+		    POSIX_FADV_DONTNEED);
 	return (error);
 }
 
@@ -847,19 +816,13 @@
  * File table vnode write routine.
  */
 static int
-vn_write(fp, uio, active_cred, flags, td)
-	struct file *fp;
-	struct uio *uio;
-	struct ucred *active_cred;
-	int flags;
-	struct thread *td;
+vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+    struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
-	struct mtx *mtxp;
-	int error, ioflag, lock_flags;
-	int advice;
-	off_t offset, start, end;
+	off_t orig_offset;
+	int advice, error, ioflag, lock_flags;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -902,7 +865,7 @@
 		/* XXX: Is this correct? */
 		break;
 	}
-	offset = uio->uio_offset;
+	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
@@ -914,55 +877,14 @@
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
-	    offset != uio->uio_offset) {
+	    orig_offset != uio->uio_offset)
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush clean pages and
 		 * buffers for the backing file after a
-		 * POSIX_FADV_NOREUSE write(2).  To optimize the
-		 * common case of using POSIX_FADV_NOREUSE with
-		 * sequential access, track the previous implicit
-		 * DONTNEED request and grow this request to include
-		 * the current write(2) in addition to the previous
-		 * DONTNEED.  With purely sequential access this will
-		 * cause the DONTNEED requests to continously grow to
-		 * cover all of the previously written regions of the
-		 * file.
-		 *
-		 * Note that the blocks just written are almost
-		 * certainly still dirty, so this only works when
-		 * VOP_ADVISE() calls from subsequent writes push out
-		 * the data written by this write(2) once the backing
-		 * buffers are clean.  However, as compared to forcing
-		 * IO_DIRECT, this gives much saner behavior.  Write
-		 * clustering is still allowed, and clean pages are
-		 * merely moved to the cache page queue rather than
-		 * outright thrown away.  This means a subsequent
-		 * read(2) can still avoid hitting the disk if the
-		 * pages have not been reclaimed.
-		 *
-		 * This does make POSIX_FADV_NOREUSE largely useless
-		 * with non-sequential access.  However, sequential
-		 * access is the more common use case and the flag is
-		 * merely advisory.
+		 * POSIX_FADV_NOREUSE write(2).
 		 */
-		start = offset;
-		end = uio->uio_offset - 1;
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if (fp->f_advice != NULL &&
-		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
-			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
-				start = fp->f_advice->fa_prevstart;
-			else if (fp->f_advice->fa_prevstart != 0 &&
-			    fp->f_advice->fa_prevstart == end + 1)
-				end = fp->f_advice->fa_prevend;
-			fp->f_advice->fa_prevstart = start;
-			fp->f_advice->fa_prevend = end;
-		}
-		mtx_unlock(mtxp);
-		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
-	}
-
+		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
+		    POSIX_FADV_DONTNEED);
 unlock:
 	return (error);
 }
Index: sys/sys/buf.h
===================================================================
--- sys/sys/buf.h
+++ sys/sys/buf.h
@@ -204,7 +204,7 @@
 #define	B_PERSISTENT	0x00000100	/* Perm. ref'ed while EXT2FS mounted. */
 #define	B_DONE		0x00000200	/* I/O completed. */
 #define	B_EINTR		0x00000400	/* I/O was interrupted */
-#define	B_00000800	0x00000800	/* Available flag. */
+#define	B_NOREUSE	0x00000800	/* Won't be reused once released. */
 #define	B_00001000	0x00001000	/* Available flag. */
 #define	B_INVAL		0x00002000	/* Does not contain valid info. */
 #define	B_BARRIER	0x00004000	/* Write this and all preceeding first. */
@@ -229,7 +229,7 @@
 #define	PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \
 	"\33paging\32infreecnt\31nocopy\30b23\27relbuf\26dirty\25b20" \
 	"\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \
-	"\15b12\14b11\13eintr\12done\11persist\10delwri" \
+	"\15b12\14noreuse\13eintr\12done\11persist\10delwri" \
 	"\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age"
 
 /*
Index: sys/sys/file.h
===================================================================
--- sys/sys/file.h
+++ sys/sys/file.h
@@ -160,8 +160,6 @@
 	int	fa_advice;		/* (f) FADV_* type. */
 	off_t	fa_start;		/* (f) Region start. */
 	off_t	fa_end;			/* (f) Region end. */
-	off_t	fa_prevstart;		/* (f) Previous NOREUSE start. */
-	off_t	fa_prevend;		/* (f) Previous NOREUSE end. */
 };
 
 struct file {
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -304,10 +304,10 @@
 void	vm_object_set_writeable_dirty (vm_object_t);
 void	vm_object_init (void);
 void	vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
-void	vm_object_page_cache(vm_object_t object, vm_pindex_t start,
-	    vm_pindex_t end);
 boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
 	    vm_ooffset_t end, int flags);
+void	vm_object_page_noreuse(vm_object_t object, vm_pindex_t start,
+	    vm_pindex_t end);
 void	vm_object_page_remove(vm_object_t object, vm_pindex_t start,
 	    vm_pindex_t end, int options);
 boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -1963,15 +1963,15 @@
 }
 
 /*
- *	vm_object_page_cache:
+ *	vm_object_page_noreuse:
  *
- *	For the given object, attempt to move the specified clean
- *	pages to the cache queue.  If a page is wired for any reason,
- *	then it will not be changed.  Pages are specified by the given
- *	range ["start", "end").  As a special case, if "end" is zero,
- *	then the range extends from "start" to the end of the object.
- *	Any mappings to the specified pages are removed before the
- *	pages are moved to the cache queue.
+ *	For the given object, attempt to move the specified pages to
+ *	the head of the inactive queue.  This bypasses regular LRU
+ *	operation and allows the pages to be reused quickly under memory
+ *	pressure.  If a page is wired for any reason, then it will not
+ *	be queued.  Pages are specified by the range ["start", "end").
+ *	As a special case, if "end" is zero, then the range extends from
+ *	"start" to the end of the object.
 *
 *	This operation should only be performed on objects that
 *	contain non-fictitious, managed pages.
@@ -1979,21 +1979,21 @@
 *	The object must be locked.
 */
 void
-vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	struct mtx *mtx, *new_mtx;
 	vm_page_t p, next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
-	    ("vm_object_page_cache: illegal object %p", object));
+	    ("vm_object_page_noreuse: illegal object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 	p = vm_page_find_least(object, start);
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
-	 * greater than or equal to the parameter "start" or (2) NULL. 
+	 * greater than or equal to the parameter "start" or (2) NULL.
 	 */
 	mtx = NULL;
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
@@ -2009,7 +2009,7 @@
 			mtx = new_mtx;
 			mtx_lock(mtx);
 		}
-		vm_page_try_to_cache(p);
+		vm_page_deactivate_noreuse(p);
 	}
 	if (mtx != NULL)
 		mtx_unlock(mtx);
Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -451,6 +451,7 @@
 int vm_page_try_to_cache (vm_page_t);
 int vm_page_try_to_free (vm_page_t);
 void vm_page_deactivate (vm_page_t);
+void vm_page_deactivate_noreuse(vm_page_t);
 void vm_page_dequeue(vm_page_t m);
 void vm_page_dequeue_locked(vm_page_t m);
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -2589,6 +2589,19 @@
 }
 
 /*
+ * Move the specified page to the inactive queue with the expectation
+ * that it is unlikely to be reused.
+ *
+ * The page must be locked.
+ */
+void
+vm_page_deactivate_noreuse(vm_page_t m)
+{
+
+	_vm_page_deactivate(m, 1);
+}
+
+/*
  *	vm_page_try_to_cache:
  *
  *	Returns 0 on failure, 1 on success
@@ -2740,8 +2753,7 @@
 /*
  *	vm_page_advise
  *
- * 	Deactivate or do nothing, as appropriate.  This routine is used
- * 	by madvise() and vop_stdadvise().
+ * 	Deactivate or do nothing, as appropriate.
  *
  *	The object and page must be locked.
  */
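For reference, a minimal userspace sketch (not part of the patch) of how the reworked path is exercised: with POSIX_FADV_NOREUSE set on a descriptor, each read(2) now issues an implicit POSIX_FADV_DONTNEED for just the range it consumed, so vop_stdadvise() marks the covered buffers B_NOREUSE and brelse() moves their clean pages to the head of the inactive queue via vm_page_deactivate_noreuse(). The file path and buffer size below are arbitrary placeholders.

/*
 * Stream a large file once without polluting the page cache.
 * Assumes an arbitrary path; error handling kept minimal.
 */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	static char buf[128 * 1024];
	ssize_t n;
	int fd;

	fd = open("/path/to/large/file", O_RDONLY);
	if (fd < 0)
		exit(1);

	/* Hint that data read from this file will not be reused. */
	if (posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE) != 0)
		exit(1);

	/* Each read(2) triggers an implicit DONTNEED for the range read. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;

	close(fd);
	return (n < 0 ? 1 : 0);
}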