D25968.id75751.diff

Index: sys/fs/nullfs/null_subr.c
===================================================================
--- sys/fs/nullfs/null_subr.c
+++ sys/fs/nullfs/null_subr.c
@@ -258,6 +258,26 @@
if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp)
vp->v_vflag |= VV_ROOT;
+ /*
+ * We might miss the case where the lower vnode sets VIRF_PGREAD
+ * some time after construction, which is the typical case.
+ * null_open rechecks.
+ */
+ if ((lowervp->v_irflag & VIRF_PGREAD) != 0) {
+ MPASS(lowervp->v_object != NULL);
+ if ((vp->v_irflag & VIRF_PGREAD) == 0) {
+ if (vp->v_object == NULL)
+ vp->v_object = lowervp->v_object;
+ else
+ MPASS(vp->v_object == lowervp->v_object);
+ VI_LOCK(vp);
+ vp->v_irflag |= VIRF_PGREAD;
+ VI_UNLOCK(vp);
+ } else {
+ MPASS(vp->v_object != NULL);
+ }
+ }
+
/*
* Atomically insert our new node into the hash or vget existing
* if someone else has beaten us to it.
Index: sys/fs/nullfs/null_vnops.c
===================================================================
--- sys/fs/nullfs/null_vnops.c
+++ sys/fs/nullfs/null_vnops.c
@@ -439,8 +439,17 @@
vp = ap->a_vp;
ldvp = NULLVPTOLOWERVP(vp);
retval = null_bypass(&ap->a_gen);
- if (retval == 0)
+ if (retval == 0) {
vp->v_object = ldvp->v_object;
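+ /*
+ * The lower filesystem may set VIRF_PGREAD only after our
+ * vnode was constructed (see the comment in null_subr.c),
+ * so re-check and propagate the flag on each open.
+ */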
+ if ((ldvp->v_irflag & VIRF_PGREAD) != 0) {
+ MPASS(vp->v_object != NULL);
+ if ((vp->v_irflag & VIRF_PGREAD) == 0) {
+ VI_LOCK(vp);
+ vp->v_irflag |= VIRF_PGREAD;
+ VI_UNLOCK(vp);
+ }
+ }
+ }
return (retval);
}
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -4211,7 +4211,9 @@
buf[1] = '\0';
if (vp->v_irflag & VIRF_DOOMED)
strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
- flags = vp->v_irflag & ~(VIRF_DOOMED);
+ if (vp->v_irflag & VIRF_PGREAD)
+ strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
+ flags = vp->v_irflag & ~(VIRF_DOOMED | VIRF_PGREAD);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -127,11 +127,15 @@
static const int io_hold_cnt = 16;
static int vn_io_fault_enable = 1;
-SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
&vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
static int vn_io_fault_prefault = 0;
-SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW,
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
&vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
+static int vn_io_pgcache_read_enable = 1;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
+ &vn_io_pgcache_read_enable, 0,
+ "Enable copying from page cache for reads, avoiding fs");
static u_long vn_io_faults_cnt;
SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
&vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
@@ -844,6 +848,118 @@
return (ret);
}
+static int
+vn_read_from_obj(struct vnode *vp, struct uio *uio)
+{
+ vm_object_t obj;
+ vm_page_t ma[io_hold_cnt + 2];
+ off_t off, vsz;
+ ssize_t resid;
+ int error, i, j;
+
+ obj = vp->v_object;
+ MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
+ MPASS(obj != NULL);
+ MPASS(obj->type == VREG);
+
+ /*
+ * Depends on type stability of vm_objects.
+ */
+ vm_object_pip_add(obj, 1);
+ if ((obj->flags & OBJ_DEAD) != 0) {
+ /*
+ * Note that the object might already be reused from the
+ * vnode, and the OBJ_DEAD flag cleared. This is fine: we
+ * recheck for the DOOMED vnode state after all pages are
+ * busied, and back out then.
+ *
+ * But we check for OBJ_DEAD to ensure that we do not
+ * busy pages while vm_object_terminate_pages()
+ * processes the queue.
+ */
+ error = EJUSTRETURN;
+ goto out_pip;
+ }
+
+ resid = uio->uio_resid;
+ off = uio->uio_offset;
+ for (i = 0; resid > 0; i++) {
+ MPASS(i < io_hold_cnt + 2);
+ ma[i] = vm_page_grab_unlocked(obj, atop(off),
+ VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
+ VM_ALLOC_NOWAIT);
+ if (ma[i] == NULL)
+ break;
+
+ /*
+ * Skip invalid pages. The valid mask can be partial only
+ * at EOF, and we clip the length later.
+ */
+ if (vm_page_none_valid(ma[i])) {
+ vm_page_sunbusy(ma[i]);
+ break;
+ }
+
+ resid -= PAGE_SIZE;
+ off += PAGE_SIZE;
+ }
+ if (i == 0) {
+ error = EJUSTRETURN;
+ goto out_pip;
+ }
+
+ /*
+ * Check VIRF_DOOMED after we busied our pages. Since
+ * vgonel() terminates the vnode's vm_object, it cannot
+ * process past the pages busied by us.
+ */
+ if (VN_IS_DOOMED(vp)) {
+ error = EJUSTRETURN;
+ goto out;
+ }
+
+ resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
+ if (resid > uio->uio_resid)
+ resid = uio->uio_resid;
+
+ /*
+ * Unlocked read of vnp_size is safe because truncation cannot
+ * pass a busied page. But we load vnp_size into a local
+ * variable so that a possible concurrent extension does not
+ * break the calculation.
+ */
+#if defined(__powerpc__) && !defined(__powerpc64__)
+ vsz = obj->un_pager.vnp.vnp_size;
+#else
+ vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
+#endif
+ if (uio->uio_offset + resid > vsz)
+ resid = vsz - uio->uio_offset;
+
+ error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
+
+out:
+ for (j = 0; j < i; j++) {
+ if (error == 0)
+ vm_page_reference(ma[j]);
+ vm_page_sunbusy(ma[j]);
+ }
+out_pip:
+ vm_object_pip_wakeup(obj);
+ if (error != 0)
+ return (error);
+ return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
+}
+
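+/*
+ * Decide whether the page cache fast path can serve this read: the
+ * vnode must have VIRF_PGREAD set, no MAC read hook may be active,
+ * the request must fit into io_hold_cnt pages at a non-negative
+ * offset, the file must not be opened O_DIRECT, and the
+ * vn_io_pgcache_read_enable knob must be on.
+ */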
+static bool
+do_vn_read_from_pgcache(struct vnode *vp, struct uio *uio, struct file *fp)
+{
+ return ((vp->v_irflag & VIRF_PGREAD) != 0 &&
+ !mac_vnode_check_read_enabled() &&
+ uio->uio_resid <= ptoa(io_hold_cnt) && uio->uio_offset >= 0 &&
+ (fp->f_flag & O_DIRECT) == 0 && vn_io_pgcache_read_enable);
+}
+
/*
* File table vnode read routine.
*/
@@ -860,6 +976,15 @@
uio->uio_td, td));
KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
vp = fp->f_vnode;
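+ /*
+ * Try the page cache fast path first. EJUSTRETURN means the
+ * request was not (fully) satisfied from the cache and the
+ * regular VOP_READ path below finishes it.
+ */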
+ if (do_vn_read_from_pgcache(vp, uio, fp)) {
+ error = vn_read_from_obj(vp, uio);
+ if (error == 0) {
+ fp->f_nextoff[UIO_READ] = uio->uio_offset;
+ return (0);
+ }
+ if (error != EJUSTRETURN)
+ return (error);
+ }
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
@@ -1164,8 +1289,8 @@
uio_clone->uio_iovcnt--;
continue;
}
- if (len > io_hold_cnt * PAGE_SIZE)
- len = io_hold_cnt * PAGE_SIZE;
+ if (len > ptoa(io_hold_cnt))
+ len = ptoa(io_hold_cnt);
addr = (uintptr_t)uio_clone->uio_iov->iov_base;
end = round_page(addr + len);
if (end < addr) {
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -244,6 +244,8 @@
#define VHOLD_ALL_FLAGS (VHOLD_NO_SMR)
#define VIRF_DOOMED 0x0001 /* This vnode is being recycled */
+#define VIRF_PGREAD 0x0002 /* Direct reads from the page cache are permitted,
+ never cleared once set */
#define VI_TEXT_REF 0x0001 /* Text ref grabbed use ref */
#define VI_MOUNT 0x0002 /* Mount in progress */
Index: sys/ufs/ufs/ufs_vnops.c
===================================================================
--- sys/ufs/ufs/ufs_vnops.c
+++ sys/ufs/ufs/ufs_vnops.c
@@ -282,13 +282,20 @@
return (EOPNOTSUPP);
ip = VTOI(vp);
+ vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
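+ /*
+ * The vnode must have a vm_object before VIRF_PGREAD is set;
+ * the flag is never cleared once set, so only regular files
+ * are marked.
+ */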
+ if (vp->v_type == VREG && (vp->v_irflag & VIRF_PGREAD) == 0) {
+ VI_LOCK(vp);
+ vp->v_irflag |= VIRF_PGREAD;
+ VI_UNLOCK(vp);
+ }
+
/*
* Files marked append-only must be opened for appending.
*/
if ((ip->i_flags & APPEND) &&
(ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
return (EPERM);
- vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
+
return (0);
}
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -192,9 +192,11 @@
("object %p has reservations",
object));
#endif
+#if 0
KASSERT(blockcount_read(&object->paging_in_progress) == 0,
("object %p paging_in_progress = %d",
object, blockcount_read(&object->paging_in_progress)));
+#endif
KASSERT(!vm_object_busied(object),
("object %p busy = %d", object, blockcount_read(&object->busy)));
KASSERT(object->resident_page_count == 0,
@@ -294,6 +296,9 @@
* The lock portion of struct vm_object must be type stable due
* to vm_pageout_fallback_object_lock locking a vm object
* without holding any references to it.
+ *
+ * paging_in_progress is always valid. Lockless references to
+ * the objects may acquire pip and then check OBJ_DEAD.
*/
obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
#ifdef INVARIANTS
@@ -936,13 +941,14 @@
("terminating shadow obj %p", object));
/*
- * wait for the pageout daemon to be done with the object
+ * Wait for the pageout daemon and other current users to be
+ * done with the object. Note that new paging_in_progress
+ * users can come after this wait, but they must check that the
+ * OBJ_DEAD flag is set (without unlocking the object) and avoid
+ * touching the object being terminated.
*/
vm_object_pip_wait(object, "objtrm");
- KASSERT(!blockcount_read(&object->paging_in_progress),
- ("vm_object_terminate: pageout in progress"));
-
KASSERT(object->ref_count == 0,
("vm_object_terminate: object with references, ref_count=%d",
object->ref_count));
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -520,7 +520,11 @@
vm_page_xunbusy(m);
}
out:
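+ /*
+ * vn_read_from_obj() reads vnp_size without the object lock, so
+ * store the new size atomically; 32-bit powerpc keeps the plain
+ * store, matching the #if in vn_read_from_obj().
+ */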
+#if defined(__powerpc__) && !defined(__powerpc64__)
object->un_pager.vnp.vnp_size = nsize;
+#else
+ atomic_store_64(&object->un_pager.vnp.vnp_size, nsize);
+#endif
object->size = nobjsize;
VM_OBJECT_WUNLOCK(object);
}
