sys/kern/vfs_vnops.c
[... 121 lines elided; context: struct fileops vnops = { ...]
	.fo_fill_kinfo = vn_fill_kinfo,
	.fo_mmap = vn_mmap,
	.fo_fallocate = vn_fallocate,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

static const int io_hold_cnt = 16;
static int vn_io_fault_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
static int vn_io_fault_prefault = 0;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
    &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
static int vn_io_pgcache_read_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
    &vn_io_pgcache_read_enable, 0,
    "Enable copying from page cache for reads, avoiding fs");

markj: Should it be RWTUN? Same for the sysctls above.
kib: I did not find a use for it, since vn_io_fault_enable was introduced as an emergency switch. But I do not see why not.
static u_long vn_io_faults_cnt;
SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
static int vfs_allow_read_dir = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
    &vfs_allow_read_dir, 0,
    "Enable read(2) of directory by root for filesystems that support it");
[... 20 lines elided; context: struct vn_io_fault_args { ...]
	enum {
		VN_IO_FAULT_FOP,
		VN_IO_FAULT_VOP
	} kind;
	struct ucred *cred;
	int flags;
	union {
		struct fop_args_tag {
			struct file *fp;
mjg: Can we elide most of this branchfest? I suspect most of these branches can be reworked into a flag for v_irflag.
kib: I think it would be premature optimization. Also, there is not much that can be collapsed into a single flag there; perhaps I can move MNTK_PGCACHE_READ into a vnode irflag, but that's it.
mjg: Having it in irflag would eliminate the VREG and mp != NULL checks (along with the mp read), which I think is a nice win, assuming it's not problematic.
			fo_rdwr_t *doio;
		} fop_args;
		struct vop_args_tag {
			struct vnode *vp;
		} vop_args;
	} args;
};
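The kind tag selects which union arm the I/O helper dispatches on. A rough sketch of that dispatch, reconstructed here for orientation since vn_io_fault_doio() itself is outside the visible hunks (simplified; the real helper also disables pagefaults around the call):

	static int
	vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
	    struct thread *td)
	{
		int error;

		error = 0;
		switch (args->kind) {
		case VN_IO_FAULT_FOP:
			/* Redo the whole fileops-level I/O routine. */
			error = (args->args.fop_args.doio)(args->args.fop_args.fp,
			    uio, args->cred, args->flags, td);
			break;
		case VN_IO_FAULT_VOP:
			/* Redo only the vnode op, direction per uio_rw. */
			if (uio->uio_rw == UIO_READ)
				error = VOP_READ(args->args.vop_args.vp, uio,
				    args->flags, args->cred);
			else
				error = VOP_WRITE(args->args.vop_args.vp, uio,
				    args->flags, args->cred);
			break;
		}
		return (error);
	}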
[... 659 lines elided; context: get_advice(struct file *fp, struct uio *uio) ...]
	if (fp->f_advice != NULL &&
	    uio->uio_offset >= fp->f_advice->fa_start &&
	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
		ret = fp->f_advice->fa_advice;
	mtx_unlock(mtxp);
	return (ret);
}
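For context, the fa_start/fa_end window consulted above is installed from userspace with posix_fadvise(2). A minimal illustrative caller (the function name and range are hypothetical):

	#include <fcntl.h>

	/*
	 * Advise random access for the first megabyte; read(2) calls whose
	 * [offset, offset + resid) range lies inside the window then get
	 * POSIX_FADV_RANDOM back from get_advice().
	 */
	static int
	advise_random_window(int fd)
	{
		return (posix_fadvise(fd, 0, 1024 * 1024, POSIX_FADV_RANDOM));
	}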
static int
vn_read_from_obj(struct vnode *vp, struct uio *uio)
{
	vm_object_t obj;
	vm_page_t ma[io_hold_cnt + 2];
	off_t off, vsz;
	ssize_t resid;
	int error, i, j;

	obj = vp->v_object;
	MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
	MPASS(obj != NULL);
	MPASS(obj->type == OBJT_VNODE);
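Sizing note (an inference for the reader, not stated in the review): do_vn_read_from_pgcache() below caps uio_resid at ptoa(io_hold_cnt), and a transfer that starts mid-page can straddle one extra page, so the array bound holds with a page of slack:

	ptoa(16) = 16 * PAGE_SIZE = 64 KB with 4 KB pages;
	worst-case pages touched = 64 KB / 4 KB + 1 = 17 <= io_hold_cnt + 2 = 18.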
markj: I would add /* Depends on type-stability. */ here.

	/*
	 * Depends on type stability of vm_objects.
	 */
	vm_object_pip_add(obj, 1);
	if ((obj->flags & OBJ_DEAD) != 0) {
		/*
		 * Note that object might be already reused from the
		 * vnode, and the OBJ_DEAD flag cleared.  This is fine,
		 * we recheck for DOOMED vnode state after all pages
		 * are busied, and retract then.
		 *
		 * But we check for OBJ_DEAD to ensure that we do not
		 * busy pages while vm_object_terminate_pages()
		 * processes the queue.
		 */
		error = EJUSTRETURN;
		goto out_pip;
	}

jeff: I'm confused about all of this. The vnode has a reference to the object. We are reading from a reference to a vnode. Is the vnode shared locked here? This is all to handle a race with forced unmount?
kib: No, the vnode is not locked; we only have a reference to it through struct file f_vnode. The vnode content is read-locked by rangelocks for the read range, so we get the expected read atomicity.
jeff: I see, ok. Well, it really is a shame how much extra synchronization and complexity is required for forced unmount. Orthogonal to this patch, but maybe we should rethink the way that works to avoid all of this. Possibly something closer to write suspension.
markj: Hrm, vm_object_terminate_pages() also asserts paging_in_progress == 0.
markj: Is it possible to use vm_page_grab_pages_unlocked() here? We will potentially grab a run of invalid pages, but I do not see why it can't be handled.
kib: I tried and I do not like the result. The saving of code lines in the initial loop is eaten by the need to calculate the run length twice: first to see how many pages to grab, then to iterate and cut it for invalid pages. Also, busying any invalid page after the first is just a waste.
markj: Don't we need to revalidate the object identity after busying pages? Suppose we slept on an xbusy page; I don't see any guarantee that the object was not reallocated in the meantime.
kib: This is handled by the DOOMED check. The object cannot be deallocated before the vnode is marked as DOOMED, and we bail out if we noted that.
markj: But it's still possible that we are grabbing pages from an arbitrary object, right? I think we must never block in that case, i.e., VM_ALLOC_NOWAIT must be specified for all pages in the range. Consider that we may allocate objects for kernel memory in which pages are permanently xbusy. Kernel stacks are an example. Before r360354 kernel stack objects were dynamically allocated.
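A rough summary of the ordering the thread above is debating (my reading, not text from the review): the reader's PIP reference and busy locks fence it against object termination, and VIRF_DOOMED is the authoritative recheck.

	/*
	 * Reader (this function)           vgonel() / object termination
	 * ----------------------           ------------------------------
	 * vm_object_pip_add(obj, 1)
	 * OBJ_DEAD check ----------------- OBJ_DEAD set; vm_object_terminate()
	 *                                  waits for PIP to drain and for
	 *                                  pages to be unbusied
	 * sbusy the pages
	 * VIRF_DOOMED recheck ------------ VIRF_DOOMED set before the object
	 *                                  is terminated
	 * copy data, or retract
	 */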
	resid = uio->uio_resid;
	off = uio->uio_offset;

markj: Use vm_page_none_valid().
	for (i = 0; resid > 0; i++) {
		MPASS(i < io_hold_cnt + 2);
		ma[i] = vm_page_grab_unlocked(obj, atop(off),
		    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
		    VM_ALLOC_NOWAIT);
		if (ma[i] == NULL)
			break;
markj: Note that vm_object_page_remove() does not remove wired pages from the object, it merely marks them invalid. So we may temporarily busy an invalid page after vm_object_page_remove() has passed it. Then we may race with vm_object_terminate_pages(), which asserts that all resident pages are unbusy. vm_object_terminate_pages() removes pages from their object, which is an operation protected by the busy lock. So I don't think we can correctly weaken the assertion.
markj: I wonder why vm_object_page_remove() does not remove wired pages from the object.
kib: Because it causes other issues. In short, there is an architectural contradiction between user wiring and msync(MS_INVALIDATE). On one hand, we must keep the wired pages around, since otherwise e.g. vm_object_unwire() panics with 'missed page'. On the other hand, MS_INVALIDATE works by doing vm_object_page_remove(). So the solution that fixed a lot of breakage was to keep wired pages but invalidate them. I am still not sure that everything was fixed, but definitely a lot.
markj: Hmm, right. We prohibit msync(MS_INVALIDATE) on an mlock()ed region, but the mapping may be mlock()ed into a different address space. I will say that keeping the wired pages also introduces a fair bit of complexity. r345382 is one example, but I remember that there were others in the past.
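To make the contradiction concrete, a hypothetical userspace sequence (file path illustrative; per POSIX and the msync(2) man page, MS_INVALIDATE on locked pages fails with EBUSY):

	#include <sys/mman.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("/tmp/f", O_RDWR | O_CREAT, 0644);
		ftruncate(fd, 4096);
		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);

		/* Wire the page; may fail if RLIMIT_MEMLOCK is too low. */
		mlock(p, 4096);
		if (msync(p, 4096, MS_INVALIDATE) == -1)
			perror("msync");	/* expected: EBUSY */
		return (0);
	}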
		/*
		 * Skip invalid pages.  Valid mask can be partial only
		 * at EOF, and we clip later.
		 */
		if (vm_page_none_valid(ma[i])) {
			vm_page_sunbusy(ma[i]);
			break;
		}
		resid -= PAGE_SIZE;
		off += PAGE_SIZE;
	}

markj: Are you referring to the vm_object_page_clean() call in vnode_destroy_vobject()? Termination assumes that the object's pages are unbusied. How do we guarantee that we are not busying pages after vm_object_page_clean() was already called?
kib: vnode_destroy_vobject() -> vinvalbuf() -> bufobj_invalbuf() -> vm_object_page_remove(), which waits to obtain xbusy before removing the page from the queue.
markj: Do we need some fence here, and after VIRF_DOOMED is set in vgonel()?
kib: Arguably the busy acquire above synchronizes with the xunbusy in vm_object_remove_pages(), which gives us the required visibility.
	if (i == 0) {
		error = EJUSTRETURN;
		goto out_pip;
	}

markj: Shouldn't it be ptoa()?
kib: Huh, yes. It worked because a short cache read (uio->uio_resid != 0) falls back to VOP_READ() by design. I will re-measure after this fix.
	/*
	 * Check VIRF_DOOMED after we busied our pages.  Since
	 * vgonel() terminates the vnode's vm_object, it cannot
	 * process past pages busied by us.
	 */

markj: Extra space before ;.
	if (VN_IS_DOOMED(vp)) {
		error = EJUSTRETURN;
		goto out;
	}

markj: Presumably this should be atomic_load.
kib: Hmm, perhaps, but then this should be limited to LP64 arches. I will think about it.
	resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
	if (resid > uio->uio_resid)
		resid = uio->uio_resid;

markj: We should ensure that this counts as a reference with respect to LRU. I think there are basically two things we could do: call vm_page_deactivate(), or call vm_page_reference(). The first will trigger a requeue of the page, so it will go to the tail of the inactive queue in the near future. This is basically what happens when a page is evicted from the buffer cache. vm_page_reference() merely sets a flag that will tell the page daemon to requeue or reactivate the page, so it is a "lazy" requeue. In particular it does not preserve LRU, but it is cheaper. To be more specific about the behaviour of vm_page_reference(): when an inactive scan encounters a page with PGA_REFERENCED set, and the object has ref_count > 0, the page is moved to the active queue. If ref_count == 0, it is requeued. So, if the page is mapped elsewhere, it will be activated.
kib: Suppose that the page is active; what would vm_page_deactivate() do? From my reading of vm_page_mvqueue(), it would schedule the page move to the inactive queue (same as before the scheduling stuff), right? I do not think this is fair. Buffer cache deactivation of the page on buffer reclamation is supported by some specific use case, and by the fact that we lose all page access history when the buffer is formed. Moreover, the page is kept resident for a guaranteed long time. In other words, I do not think that deactivation is fair. OTOH, I do not see a problem with vm_page_reference().
markj: Right, vm_page_deactivate() will unconditionally move the page to the inactive queue. vm_page_unwire(PQ_INACTIVE) actually has friendlier semantics: if the page is in the active queue, PGA_REFERENCED is set and the page is left in the active queue. We should probably have a function unrelated to wiring that implements this behaviour. In the meantime, I think using vm_page_reference() is fine.
	/*
	 * Unlocked read of vnp_size is safe because truncation cannot
	 * pass a busied page.  But we load vnp_size into a local
	 * variable so that possible concurrent extension does not
	 * break the calculation.
	 */
#if defined(__powerpc__) && !defined(__powerpc64__)
	vsz = obj->un_pager.vnp.vnp_size;
#else
	vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
#endif
	if (uio->uio_offset + resid > vsz)
		resid = vsz - uio->uio_offset;
	error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
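Background note (my inference, not from the review): the 32-bit powerpc special case presumably exists because that platform provides no 64-bit atomic_load_64, so a plain load of the 64-bit vnp_size is the only option there; the busied pages still keep truncation from moving vnp_size underneath the calculation.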
markj (unsubmitted): Can't we handle requests larger than ptoa(io_hold_cnt) by looping with the object PIP held?
kib (author, unsubmitted): We can, but I am not sure it is much useful. The vn_io_fault code already chunks io into io_hold_cnt… Might be we could not do pgcache read for !vniofault, but this seems to be too arbitrary.
markj (unsubmitted): Thanks, I see now.
out:
	for (j = 0; j < i; j++) {
		if (error == 0)
			vm_page_reference(ma[j]);
		vm_page_sunbusy(ma[j]);
	}
out_pip:
	vm_object_pip_wakeup(obj);
	if (error != 0)
		return (error);
	return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
}
static bool
do_vn_read_from_pgcache(struct vnode *vp, struct uio *uio, struct file *fp)
{

	return ((vp->v_irflag & VIRF_PGREAD) != 0 &&
	    !mac_vnode_check_read_enabled() &&
	    uio->uio_resid <= ptoa(io_hold_cnt) && uio->uio_offset >= 0 &&
	    (fp->f_flag & O_DIRECT) == 0 && vn_io_pgcache_read_enable);
}
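For scale (assuming the common 4 KB page size): ptoa(io_hold_cnt) = 16 * 4096 = 64 KB, so the fast path serves reads of at most 64 KB, at non-negative offsets, with MAC read checks inactive, without O_DIRECT, and only while the debug.vn_io_pgcache_read_enable knob is on; everything else falls through to the regular VOP_READ() path.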
/* | |||||
* File table vnode read routine. | * File table vnode read routine. | ||||
*/ | */ | ||||
static int | static int | ||||
vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, | vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, | ||||
struct thread *td) | struct thread *td) | ||||
{ | { | ||||
struct vnode *vp; | struct vnode *vp; | ||||
off_t orig_offset; | off_t orig_offset; | ||||
int error, ioflag; | int error, ioflag; | ||||
int advice; | int advice; | ||||
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", | KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", | ||||
uio->uio_td, td)); | uio->uio_td, td)); | ||||
KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); | KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); | ||||
vp = fp->f_vnode; | vp = fp->f_vnode; | ||||
	if (do_vn_read_from_pgcache(vp, uio, fp)) {
		error = vn_read_from_obj(vp, uio);
		if (error == 0) {
			fp->f_nextoff[UIO_READ] = uio->uio_offset;
			return (0);
		}
		if (error != EJUSTRETURN)
			return (error);
	}

mjg: untested but roughly:

	diff --git a/sys/security/mac/mac_framework.h b/sys/security/mac/mac_framework.h
	index 70a7aad44757..bcc3af03b22d 100644
	--- a/sys/security/mac/mac_framework.h
	+++ b/sys/security/mac/mac_framework.h
	@@ -541,13 +541,14 @@
	 int	mac_vnode_check_read_impl(struct ucred *active_cred,
	 	    struct ucred *file_cred, struct vnode *vp);
	 extern bool mac_vnode_check_read_fp_flag;
	+#define	mac_vnode_check_read_enabled() __predict_false(mac_vnode_check_read_fp_flag)
	 
	 static inline int
	 mac_vnode_check_read(struct ucred *active_cred, struct ucred *file_cred,
	     struct vnode *vp)
	 {
	 
	 	mac_vnode_assert_locked(vp, "mac_vnode_check_read");
	-	if (__predict_false(mac_vnode_check_read_fp_flag))
	+	if (mac_vnode_check_read_enabled())
	 		return (mac_vnode_check_read_impl(active_cred, file_cred, vp));
	 	return (0);
	 }

and then mac_vnode_check_read_enabled above.

mjg: After r363935 you can use mac_vnode_check_read_enabled() with and without MAC compiled in.
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	advice = get_advice(fp, uio);
	vn_lock(vp, LK_SHARED | LK_RETRY);
[... 288 lines elided; context: while (uio_clone->uio_resid != 0) { ...]
		len = uio_clone->uio_iov->iov_len;
		if (len == 0) {
			KASSERT(uio_clone->uio_iovcnt >= 1,
			    ("iovcnt underflow"));
			uio_clone->uio_iov++;
			uio_clone->uio_iovcnt--;
			continue;
		}
		if (len > ptoa(io_hold_cnt))
			len = ptoa(io_hold_cnt);
		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
		end = round_page(addr + len);
		if (end < addr) {
			error = EFAULT;
			break;
		}
		cnt = atop(end - trunc_page(addr));
		/*
[... 1,986 lines elided ...]