Index: head/sys/ufs/ffs/ffs_vnops.c
===================================================================
--- head/sys/ufs/ffs/ffs_vnops.c
+++ head/sys/ufs/ffs/ffs_vnops.c
@@ -77,6 +77,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -86,6 +87,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -102,8 +104,9 @@
 #ifdef DIRECTIO
 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 #endif
-static vop_fsync_t	ffs_fsync;
 static vop_fdatasync_t	ffs_fdatasync;
+static vop_fsync_t	ffs_fsync;
+static vop_getpages_t	ffs_getpages;
 static vop_lock1_t	ffs_lock;
 static vop_read_t	ffs_read;
 static vop_write_t	ffs_write;
@@ -119,13 +122,12 @@
 static vop_setextattr_t	ffs_setextattr;
 static vop_vptofh_t	ffs_vptofh;
 
-
 /* Global vfs data structures for ufs. */
 struct vop_vector ffs_vnodeops1 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
-	.vop_getpages =		vnode_pager_local_getpages,
+	.vop_getpages =		ffs_getpages,
 	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
@@ -147,7 +149,7 @@
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
-	.vop_getpages =		vnode_pager_local_getpages,
+	.vop_getpages =		ffs_getpages,
 	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
@@ -1784,3 +1786,165 @@
 	ufhp->ufid_gen = ip->i_gen;
 	return (0);
 }
+
+SYSCTL_DECL(_vfs_ffs);
+static int use_buf_pager = 1;
+SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
+    "Always use buffer pager instead of bmap");
+static int buf_pager_relbuf;
+SYSCTL_INT(_vfs_ffs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
+    &buf_pager_relbuf, 0,
+    "Make buffer pager release buffers after reading");
+
+/*
+ * The FFS pager.  It uses buffer reads to validate pages.
+ *
+ * In contrast to the generic local pager from vm/vnode_pager.c, this
+ * pager correctly and easily handles volumes where the underlying
+ * device block size is greater than the machine page size.  The
+ * buffer cache transparently extends the requested page run to be
+ * aligned at the block boundary, and does the necessary bogus page
+ * replacements in the addends to avoid obliterating already valid
+ * pages.
+ *
+ * The only non-trivial issue is that the exclusive busy state for
+ * pages, which is assumed by the vm_pager_getpages() interface, is
+ * incompatible with the VMIO buffer cache's desire to share-busy the
+ * pages.  This function performs a trivial downgrade of the pages'
+ * state before reading buffers, and a less trivial upgrade from the
+ * shared-busy to excl-busy state after the read.
+ */
+static int
+ffs_getpages(struct vop_getpages_args *ap)
+{
+	struct vnode *vp;
+	vm_page_t *ma, m;
+	vm_object_t object;
+	struct buf *bp;
+	struct ufsmount *um;
+	ufs_lbn_t lbn, lbnp;
+	vm_ooffset_t la, lb;
+	long bsize;
+	int bo_bs, count, error, i;
+	bool redo, lpart;
+
+	vp = ap->a_vp;
+	ma = ap->a_m;
+	count = ap->a_count;
+
+	um = VFSTOUFS(ap->a_vp->v_mount);
+	bo_bs = um->um_devvp->v_bufobj.bo_bsize;
+	if (!use_buf_pager && bo_bs <= PAGE_SIZE)
+		return (vnode_pager_generic_getpages(vp, ma, count,
+		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
+
+	object = vp->v_object;
+	la = IDX_TO_OFF(ma[count - 1]->pindex);
+	if (la >= object->un_pager.vnp.vnp_size)
+		return (VM_PAGER_BAD);
+	lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
+	if (ap->a_rbehind != NULL) {
+		lb = IDX_TO_OFF(ma[0]->pindex);
+		*ap->a_rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
+	}
+	if (ap->a_rahead != NULL) {
+		*ap->a_rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);
+		if (la + IDX_TO_OFF(*ap->a_rahead) >=
+		    object->un_pager.vnp.vnp_size) {
+			*ap->a_rahead = OFF_TO_IDX(roundup2(object->un_pager.
+			    vnp.vnp_size, PAGE_SIZE) - la);
+		}
+	}
+	VM_OBJECT_WLOCK(object);
+again:
+	for (i = 0; i < count; i++)
+		vm_page_busy_downgrade(ma[i]);
+	VM_OBJECT_WUNLOCK(object);
+
+	lbnp = -1;
+	for (i = 0; i < count; i++) {
+		m = ma[i];
+
+		/*
+		 * Pages are shared busy and the object lock is not
+		 * owned, which together allow for the pages'
+		 * invalidation.  The racy test for validity avoids
+		 * useless creation of the buffer for the most typical
+		 * case when invalidation is not used in redo or for
+		 * parallel read.  The shared->excl upgrade loop at
+		 * the end of the function catches the race in a
+		 * reliable way (protected by the object lock).
+		 */
+		if (m->valid == VM_PAGE_BITS_ALL)
+			continue;
+
+		lbn = lblkno(um->um_fs, IDX_TO_OFF(m->pindex));
+		if (lbn != lbnp) {
+			bsize = blksize(um->um_fs, VTOI(vp), lbn);
+			error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED,
+			    &bp);
+			if (error != 0)
+				break;
+			KASSERT(1 /* racy, enable for debugging */ ||
+			    m->valid == VM_PAGE_BITS_ALL || i == count - 1,
+			    ("buf %d %p invalid", i, m));
+			if (i == count - 1 && lpart) {
+				VM_OBJECT_WLOCK(object);
+				if (m->valid != 0 &&
+				    m->valid != VM_PAGE_BITS_ALL)
+					vm_page_zero_invalid(m, TRUE);
+				VM_OBJECT_WUNLOCK(object);
+			}
+			if (LIST_EMPTY(&bp->b_dep)) {
+				/*
+				 * Invalidation clears m->valid, but
+				 * may leave B_CACHE flag if the
+				 * buffer existed at the invalidation
+				 * time.  In this case, recycle the
+				 * buffer to do real read on next
+				 * bread() after redo.
+				 *
+				 * Otherwise B_RELBUF is not strictly
+				 * necessary, enable to reduce buf
+				 * cache pressure.
+				 */
+				if (buf_pager_relbuf ||
+				    m->valid != VM_PAGE_BITS_ALL)
+					bp->b_flags |= B_RELBUF;
+
+				bp->b_flags &= ~B_NOCACHE;
+				brelse(bp);
+			} else {
+				bqrelse(bp);
+			}
+			lbnp = lbn;
+		}
+	}
+
+	VM_OBJECT_WLOCK(object);
+	redo = false;
+	for (i = 0; i < count; i++) {
+		vm_page_sunbusy(ma[i]);
+		ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
+
+		/*
+		 * Since the pages were only sbusy while neither the
+		 * buffer nor the object lock was held by us, or
+		 * reallocated while vm_page_grab() slept for busy
+		 * relinquish, they could have been invalidated.
+		 * Recheck the valid bits and re-read as needed.
+		 *
+		 * Note that the last page is made fully valid in the
+		 * read loop, and partial validity for the page at
+		 * index count - 1 could mean that the page was
+		 * invalidated or removed, so we must restart for
+		 * safety as well.
+		 */
+		if (ma[i]->valid != VM_PAGE_BITS_ALL)
+			redo = true;
+	}
+	if (redo && error == 0)
+		goto again;
+	VM_OBJECT_WUNLOCK(object);
+	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
+}
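
As a minimal standalone sketch (not part of the patch), the following illustrates the read-behind/read-ahead arithmetic that the new ffs_getpages() stores in *ap->a_rbehind and *ap->a_rahead before the end-of-file clamp. The local macro definitions, the 32 KB block size, and the page indexes are illustrative assumptions; in the kernel these values come from the VM headers and from the device's bufobj.

#include <stdio.h>

/*
 * Local stand-ins for the kernel macros used above, assumed here only
 * for illustration (power-of-2 alignment, 4 KB pages).
 */
#define PAGE_SIZE		4096UL
#define IDX_TO_OFF(idx)		((idx) * PAGE_SIZE)
#define OFF_TO_IDX(off)		((off) / PAGE_SIZE)
#define rounddown2(x, y)	((x) & ~((y) - 1))
#define roundup2(x, y)		(((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
	unsigned long bo_bs, lb, la, rbehind, rahead;

	bo_bs = 32768;			/* example device block size: 8 pages */
	lb = IDX_TO_OFF(3UL);		/* first requested page: index 3 */
	la = IDX_TO_OFF(4UL);		/* last requested page: index 4 */

	/* Same expressions as in ffs_getpages(), before the EOF clamp. */
	rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
	rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);

	/*
	 * Prints "rbehind 3 rahead 4": the requested page run is
	 * extended toward the 32 KB block boundaries that the buffer
	 * read will cover.
	 */
	printf("rbehind %lu rahead %lu\n", rbehind, rahead);
	return (0);
}

At run time the new behavior can be controlled through the vfs.ffs.use_buf_pager and vfs.ffs.buf_pager_relbuf knobs declared at the top of the added code; since they are CTLFLAG_RWTUN, they can also be set as loader tunables.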