sys/kern/vfs_vnops.c
[... 121 lines elided; context: struct fileops vnops = { ...]
	.fo_fill_kinfo = vn_fill_kinfo,
	.fo_mmap = vn_mmap,
	.fo_fallocate = vn_fallocate,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

static const int io_hold_cnt = 16;
static int vn_io_fault_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
static int vn_io_fault_prefault = 0;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
    &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
static int vn_io_pgcache_read_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
    &vn_io_pgcache_read_enable, 0,
    "Enable copying from page cache for reads, avoiding fs");

markj: Should it be RWTUN? Same for the sysctls above.
kib: I did not find a use for it, since vn_io_fault_enable was introduced as an emergency switch. But I do not see why not.
static u_long vn_io_faults_cnt;
SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
static int vfs_allow_read_dir = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
    &vfs_allow_read_dir, 0,
    "Enable read(2) of directory by root for filesystems that support it");
[... 20 lines elided; context: struct vn_io_fault_args { ...]
	enum {
		VN_IO_FAULT_FOP,
		VN_IO_FAULT_VOP
	} kind;
	struct ucred *cred;
	int flags;
	union {
		struct fop_args_tag {
			struct file *fp;
mjg: Can we elide most of this branchfest? I suspect most of these branches can be reworked into a flag for v_irflag.
kib: I think it would be premature optimization. Also, there is not much that can be collapsed into a single flag there; perhaps I can move MNTK_PGCACHE_READ into a vnode irflag, but that's it.
mjg: Having it in irflag would eliminate the VREG and mp != NULL checks (along with the mp read), which I think is a nice win, assuming it's not problematic.
			fo_rdwr_t *doio;
		} fop_args;
		struct vop_args_tag {
			struct vnode *vp;
		} vop_args;
	} args;
};
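The kind tag selects which union arm the I/O helper dispatches on. A rough sketch of that dispatch, reconstructed here for orientation since vn_io_fault_doio() itself is outside the visible hunks (simplified; the real helper also disables pagefaults around the call):

	static int
	vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
	    struct thread *td)
	{
		int error;

		error = 0;
		switch (args->kind) {
		case VN_IO_FAULT_FOP:
			/* Redo the whole fileops-level I/O routine. */
			error = (args->args.fop_args.doio)(args->args.fop_args.fp,
			    uio, args->cred, args->flags, td);
			break;
		case VN_IO_FAULT_VOP:
			/* Redo only the vnode op, direction per uio_rw. */
			if (uio->uio_rw == UIO_READ)
				error = VOP_READ(args->args.vop_args.vp, uio,
				    args->flags, args->cred);
			else
				error = VOP_WRITE(args->args.vop_args.vp, uio,
				    args->flags, args->cred);
			break;
		}
		return (error);
	}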
[... 659 lines elided; context: get_advice(struct file *fp, struct uio *uio) ...]
	if (fp->f_advice != NULL &&
	    uio->uio_offset >= fp->f_advice->fa_start &&
	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
		ret = fp->f_advice->fa_advice;
	mtx_unlock(mtxp);
	return (ret);
}
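For context, the fa_start/fa_end window consulted above is installed from userspace with posix_fadvise(2). A minimal illustrative caller (the function name and range are hypothetical):

	#include <fcntl.h>

	/*
	 * Advise random access for the first megabyte; read(2) calls whose
	 * [offset, offset + resid) range lies inside the window then get
	 * POSIX_FADV_RANDOM back from get_advice().
	 */
	static int
	advise_random_window(int fd)
	{
		return (posix_fadvise(fd, 0, 1024 * 1024, POSIX_FADV_RANDOM));
	}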
static int
vn_read_from_obj(struct vnode *vp, struct uio *uio)
{
	vm_object_t obj;
	vm_page_t ma[io_hold_cnt + 2];
	off_t off, vsz;
	ssize_t resid;
	int error, i, j;

	obj = vp->v_object;
	MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
	MPASS(obj != NULL);
	MPASS(obj->type == OBJT_VNODE);
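Sizing note (an inference for the reader, not stated in the review): do_vn_read_from_pgcache() below caps uio_resid at ptoa(io_hold_cnt), and a transfer that starts mid-page can straddle one extra page, so the array bound holds with a page of slack:

	ptoa(16) = 16 * PAGE_SIZE = 64 KB with 4 KB pages;
	worst-case pages touched = 64 KB / 4 KB + 1 = 17 <= io_hold_cnt + 2 = 18.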
markj: I would add /* Depends on type-stability. */ here.

	/*
	 * Depends on type stability of vm_objects.
	 */
	vm_object_pip_add(obj, 1);
	if ((obj->flags & OBJ_DEAD) != 0) {
		/*
		 * Note that object might be already reused from the
		 * vnode, and the OBJ_DEAD flag cleared.  This is fine,
		 * we recheck for DOOMED vnode state after all pages
		 * are busied, and retract then.
		 *
		 * But we check for OBJ_DEAD to ensure that we do not
		 * busy pages while vm_object_terminate_pages()
		 * processes the queue.
		 */
		error = EJUSTRETURN;
		goto out_pip;
	}

jeff: I'm confused about all of this. The vnode has a reference to the object. We are reading from a reference to a vnode. Is the vnode shared locked here? This is all to handle a race with forced unmount?
kib: No, the vnode is not locked; we only have a reference to it through struct file f_vnode. The vnode content is read-locked by rangelocks for the read range, so we get the expected read atomicity.
jeff: I see, ok. Well, it really is a shame how much extra synchronization and complexity is required for forced unmount. Orthogonal to this patch, but maybe we should rethink the way that works to avoid all of this. Possibly something closer to write suspension.
markj: Hrm, vm_object_terminate_pages() also asserts paging_in_progress == 0.
markj: Is it possible to use vm_page_grab_pages_unlocked() here? We will potentially grab a run of invalid pages, but I do not see why it can't be handled.
kib: I tried and I do not like the result. The saving of code lines in the initial loop is eaten by the need to calculate the run length twice: first to see how many pages to grab, then to iterate and cut it for invalid pages. Also, busying any invalid page after the first is just a waste.
markj: Don't we need to revalidate the object identity after busying pages? Suppose we slept on an xbusy page; I don't see any guarantee that the object was not reallocated in the meantime.
kib: This is handled by the DOOMED check. The object cannot be deallocated before the vnode is marked as DOOMED, and we bail out if we noted that.
markj: But it's still possible that we are grabbing pages from an arbitrary object, right? I think we must never block in that case, i.e., VM_ALLOC_NOWAIT must be specified for all pages in the range. Consider that we may allocate objects for kernel memory in which pages are permanently xbusy. Kernel stacks are an example. Before r360354 kernel stack objects were dynamically allocated.
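A rough summary of the ordering the thread above is debating (my reading, not text from the review): the reader's PIP reference and busy locks fence it against object termination, and VIRF_DOOMED is the authoritative recheck.

	/*
	 * Reader (this function)           vgonel() / object termination
	 * ----------------------           ------------------------------
	 * vm_object_pip_add(obj, 1)
	 * OBJ_DEAD check ----------------- OBJ_DEAD set; vm_object_terminate()
	 *                                  waits for PIP to drain and for
	 *                                  pages to be unbusied
	 * sbusy the pages
	 * VIRF_DOOMED recheck ------------ VIRF_DOOMED set before the object
	 *                                  is terminated
	 * copy data, or retract
	 */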
	resid = uio->uio_resid;
	off = uio->uio_offset;

markj: Use vm_page_none_valid().
	for (i = 0; resid > 0; i++) {
		MPASS(i < io_hold_cnt + 2);
		ma[i] = vm_page_grab_unlocked(obj, atop(off),
		    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
		    VM_ALLOC_NOWAIT);
		if (ma[i] == NULL)
			break;
markj: Note that vm_object_page_remove() does not remove wired pages from the object, it merely marks them invalid. So we may temporarily busy an invalid page after vm_object_page_remove() has passed it. Then we may race with vm_object_terminate_pages(), which asserts that all resident pages are unbusy. vm_object_terminate_pages() removes pages from their object, which is an operation protected by the busy lock. So I don't think we can correctly weaken the assertion.
markj: I wonder why vm_object_page_remove() does not remove wired pages from the object.
kib: Because it causes other issues. In short, there is an architectural contradiction between user wiring and msync(MS_INVALIDATE). On one hand, we must keep the wired pages around, since otherwise e.g. vm_object_unwire() panics with 'missed page'. On the other hand, MS_INVALIDATE works by doing vm_object_page_remove(). So the solution that fixed a lot of breakage was to keep wired pages but invalidate them. I am still not sure that everything was fixed, but definitely a lot.
markj: Hmm, right. We prohibit msync(MS_INVALIDATE) on an mlock()ed region, but the mapping may be mlock()ed into a different address space. I will say that keeping the wired pages also introduces a fair bit of complexity. r345382 is one example, but I remember that there were others in the past.
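To make the contradiction concrete, a hypothetical userspace sequence (file path illustrative; per POSIX and the msync(2) man page, MS_INVALIDATE on locked pages fails with EBUSY):

	#include <sys/mman.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("/tmp/f", O_RDWR | O_CREAT, 0644);
		ftruncate(fd, 4096);
		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);

		/* Wire the page; may fail if RLIMIT_MEMLOCK is too low. */
		mlock(p, 4096);
		if (msync(p, 4096, MS_INVALIDATE) == -1)
			perror("msync");	/* expected: EBUSY */
		return (0);
	}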
		/*
		 * Skip invalid pages.  Valid mask can be partial only
		 * at EOF, and we clip later.
		 */
		if (vm_page_none_valid(ma[i])) {
			vm_page_sunbusy(ma[i]);
			break;
		}
		resid -= PAGE_SIZE;
		off += PAGE_SIZE;
	}

markj: Are you referring to the vm_object_page_clean() call in vnode_destroy_vobject()? Termination assumes that the object's pages are unbusied. How do we guarantee that we are not busying pages after vm_object_page_clean() was already called?
kib: vnode_destroy_vobject() -> vinvalbuf() -> bufobj_invalbuf() -> vm_object_page_remove(), which waits to obtain xbusy before removing the page from the queue.
markj: Do we need some fence here, and after VIRF_DOOMED is set in vgonel()?
kib: Arguably the busy acquire above synchronizes with the xunbusy in vm_object_remove_pages(), which gives us the required visibility.
	if (i == 0) {
		error = EJUSTRETURN;
		goto out_pip;
	}

markj: Shouldn't it be ptoa()?
kib: Huh, yes. It worked because a short cache read (uio->uio_resid != 0) falls back to VOP_READ() by design. I will re-measure after this fix.
	/*
	 * Check VIRF_DOOMED after we busied our pages.  Since
	 * vgonel() terminates the vnode's vm_object, it cannot
	 * process past pages busied by us.
	 */

markj: Extra space before ;.
	if (VN_IS_DOOMED(vp)) {
		error = EJUSTRETURN;
		goto out;
	}

markj: Presumably this should be atomic_load.
kib: Hmm, perhaps, but then this should be limited to LP64 arches. I will think about it.
	resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
	if (resid > uio->uio_resid)
		resid = uio->uio_resid;

markj: We should ensure that this counts as a reference with respect to LRU. I think there are basically two things we could do: call vm_page_deactivate(), or call vm_page_reference(). The first will trigger a requeue of the page, so it will go to the tail of the inactive queue in the near future. This is basically what happens when a page is evicted from the buffer cache. vm_page_reference() merely sets a flag that will tell the page daemon to requeue or reactivate the page, so it is a "lazy" requeue. In particular it does not preserve LRU, but it is cheaper. To be more specific about the behaviour of vm_page_reference(): when an inactive scan encounters a page with PGA_REFERENCED set, and the object has ref_count > 0, the page is moved to the active queue. If ref_count == 0, it is requeued. So, if the page is mapped elsewhere, it will be activated.
kib: Suppose that the page is active; what would vm_page_deactivate() do? From my reading of vm_page_mvqueue(), it would schedule the page move to the inactive queue (same as before the scheduling stuff), right? I do not think this is fair. Buffer cache deactivation of the page on buffer reclamation is supported by some specific use case, and by the fact that we lose all page access history when the buffer is formed. Moreover, the page is kept resident for a guaranteed long time. In other words, I do not think that deactivation is fair. OTOH, I do not see a problem with vm_page_reference().
markj: Right, vm_page_deactivate() will unconditionally move the page to the inactive queue. vm_page_unwire(PQ_INACTIVE) actually has friendlier semantics: if the page is in the active queue, PGA_REFERENCED is set and the page is left in the active queue. We should probably have a function unrelated to wiring that implements this behaviour. In the meantime, I think using vm_page_reference() is fine.
	/*
	 * Unlocked read of vnp_size is safe because truncation cannot
	 * pass a busied page.  But we load vnp_size into a local
	 * variable so that possible concurrent extension does not
	 * break the calculation.
	 */
#if defined(__powerpc__) && !defined(__powerpc64__)
	vsz = obj->un_pager.vnp.vnp_size;
#else
	vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
#endif
	if (uio->uio_offset + resid > vsz)
		resid = vsz - uio->uio_offset;
	error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
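Background note (my inference, not from the review): the 32-bit powerpc special case presumably exists because that platform provides no 64-bit atomic_load_64, so a plain load of the 64-bit vnp_size is the only option there; the busied pages still keep truncation from moving vnp_size underneath the calculation.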
markj (unsubmitted): Can't we handle requests larger than ptoa(io_hold_cnt) by looping with the object PIP held?
kib (author, unsubmitted): We can, but I am not sure it is much useful. The vn_io_fault code already chunks io into io_hold_cnt… Might be we could not do pgcache read for !vniofault, but this seems to be too arbitrary.
markj (unsubmitted): Thanks, I see now.
out:
	for (j = 0; j < i; j++) {
		if (error == 0)
			vm_page_reference(ma[j]);
		vm_page_sunbusy(ma[j]);
	}
out_pip:
	vm_object_pip_wakeup(obj);
	if (error != 0)
		return (error);
	return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
}
static bool
do_vn_read_from_pgcache(struct vnode *vp, struct uio *uio, struct file *fp)
{

	return ((vp->v_irflag & VIRF_PGREAD) != 0 &&
	    !mac_vnode_check_read_enabled() &&
	    uio->uio_resid <= ptoa(io_hold_cnt) && uio->uio_offset >= 0 &&
	    (fp->f_flag & O_DIRECT) == 0 && vn_io_pgcache_read_enable);
}
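For scale (assuming the common 4 KB page size): ptoa(io_hold_cnt) = 16 * 4096 = 64 KB, so the fast path serves reads of at most 64 KB, at non-negative offsets, with MAC read checks inactive, without O_DIRECT, and only while the debug.vn_io_pgcache_read_enable knob is on; everything else falls through to the regular VOP_READ() path.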
/* | |||||
* File table vnode read routine. | * File table vnode read routine. | ||||
*/ | */ | ||||
static int | static int | ||||
vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, | vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, | ||||
struct thread *td) | struct thread *td) | ||||
{ | { | ||||
struct vnode *vp; | struct vnode *vp; | ||||
off_t orig_offset; | off_t orig_offset; | ||||
int error, ioflag; | int error, ioflag; | ||||
int advice; | int advice; | ||||
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", | KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", | ||||
uio->uio_td, td)); | uio->uio_td, td)); | ||||
KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); | KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); | ||||
vp = fp->f_vnode; | vp = fp->f_vnode; | ||||
	if (do_vn_read_from_pgcache(vp, uio, fp)) {
		error = vn_read_from_obj(vp, uio);
		if (error == 0) {
			fp->f_nextoff[UIO_READ] = uio->uio_offset;
			return (0);
		}
		if (error != EJUSTRETURN)
			return (error);
	}

mjg: untested but roughly:

	diff --git a/sys/security/mac/mac_framework.h b/sys/security/mac/mac_framework.h
	index 70a7aad44757..bcc3af03b22d 100644
	--- a/sys/security/mac/mac_framework.h
	+++ b/sys/security/mac/mac_framework.h
	@@ -541,13 +541,14 @@
	 int	mac_vnode_check_read_impl(struct ucred *active_cred,
	 	    struct ucred *file_cred, struct vnode *vp);
	 extern bool mac_vnode_check_read_fp_flag;
	+#define	mac_vnode_check_read_enabled() __predict_false(mac_vnode_check_read_fp_flag)
	 
	 static inline int
	 mac_vnode_check_read(struct ucred *active_cred, struct ucred *file_cred,
	     struct vnode *vp)
	 {
	 
	 	mac_vnode_assert_locked(vp, "mac_vnode_check_read");
	-	if (__predict_false(mac_vnode_check_read_fp_flag))
	+	if (mac_vnode_check_read_enabled())
	 		return (mac_vnode_check_read_impl(active_cred, file_cred, vp));
	 	return (0);
	 }

and then mac_vnode_check_read_enabled above.

mjg: After r363935 you can use mac_vnode_check_read_enabled() with and without MAC compiled in.
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	advice = get_advice(fp, uio);
	vn_lock(vp, LK_SHARED | LK_RETRY);
[... 288 lines elided; context: while (uio_clone->uio_resid != 0) { ...]
		len = uio_clone->uio_iov->iov_len;
		if (len == 0) {
			KASSERT(uio_clone->uio_iovcnt >= 1,
			    ("iovcnt underflow"));
			uio_clone->uio_iov++;
			uio_clone->uio_iovcnt--;
			continue;
		}
		if (len > ptoa(io_hold_cnt))
			len = ptoa(io_hold_cnt);
		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
		end = round_page(addr + len);
		if (end < addr) {
			error = EFAULT;
			break;
		}
		cnt = atop(end - trunc_page(addr));
		/*
[... 1,986 lines elided ...]