Differential D20584 Diff 58464 sys/kern/vfs_syscalls.c

Changeset View

Standalone View

sys/kern/vfs_syscalls.c

	Show First 20 Lines • Show All 4,808 Lines • ▼ Show 20 Lines
	sys_posix_fadvise(struct thread td, struct posix_fadvise_args uap)			sys_posix_fadvise(struct thread td, struct posix_fadvise_args uap)
	{			{
	int error;			int error;

	error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,			error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
	uap->advice);			uap->advice);
	return (kern_posix_error(td, error));			return (kern_posix_error(td, error));
	}			}

				int
				kern_copy_file_range(struct thread td, int infd, off_t inoffp, int outfd,
				off_t *outoffp, size_t len, u_int flags)
				{
				struct file infp, outfp;
				struct vnode invp, outvp;
				int error, lock_flags;
				size_t retlen;

				infp = outfp = NULL;
				retlen = 0;

				/* Get the file structures for the file descriptors. */
				error = fget_read(td, infd, &cap_read_rights, &infp);
				if (error != 0)
				goto out;
				error = fget_write(td, outfd, &cap_write_rights, &outfp);
				if (error != 0)
				goto out;

				/* Set the offset pointers to the correct place. */
				if (inoffp == NULL)
				inoffp = &infp->f_offset;
				if (outoffp == NULL)
				outoffp = &outfp->f_offset;

				/* Sanity check the f_flag bits. */
				if ((outfp->f_flag & (FWRITE \| FAPPEND)) != FWRITE \|\|
				(infp->f_flag & FREAD) == 0) {
				kibUnsubmitted Done Inline Actions You need a bwillwrite() call somewhere. Due to the nature of the operation, I suspect that you need to insert the calls to bwillwrite() somewhere inside the copy loop, at the moment where no resources like vnode locks or started writes are held. kib: You need a bwillwrite() call somewhere. Due to the nature of the operation, I suspect that you…
				rmacklemAuthorUnsubmitted Done Inline Actions This variant of the patch does a bwillwrite() before every "blksize" write, so hopefully this will avoid a large copy from flooding the buffer cache. rmacklem: This variant of the patch does a bwillwrite() before every "blksize" write, so hopefully this…
				error = EBADF;
				goto out;
				asomersUnsubmitted Done Inline Actions The `invp == outvp` error case isn't mentioned in the man page. asomers: The `invp == outvp` error case isn't mentioned in the man page.
				}

				/* Lock the vnodes. */
				invp = infp->f_vnode;
				error = vn_lock(invp, LK_SHARED);
				if (error != 0)
				goto out;
				outvp = outfp->f_vnode;
				if (MNT_SHARED_WRITES(outvp->v_mount))
				lock_flags = LK_SHARED;
				else
				lock_flags = LK_EXCLUSIVE;
				error = vn_lock(outvp, lock_flags);
				kibUnsubmitted Done Inline Actions You should not lock two vnodes simultaneously. We only allow this for the case where the first vnode is a directory, and second vnode is the file linked in the directory. Imagine two thread doing copy_file_range(fd1, fd2) and copy_file_range(fd2, fd1): you get user-controllable LOR. kib: You should not lock two vnodes simultaneously. We only allow this for the case where the first…
				asomersUnsubmitted Not Done Inline Actions What's the alternative? Is there a trylock variant that would work instead? Or must the entire mountpoint be locked? asomers: What's the alternative? Is there a trylock variant that would work instead? Or must the…
				kibUnsubmitted Not Done Inline Actions for (;;) { VOP_LOCK(vp1, LK_EXCLUSIVE); if (VOP_LOCK(vp2, LK_NOWAIT \| LK_EXCLUSIVE) != 0) { VOP_UNLOCK(vp1, 0); VOP_LOCK(vp2, LK_EXCLUSIVE); VOP_UNLOCK(vp2); } else { break; } } Checks for doomed vnodes avoided for clarity. kib: ``` for (;;) { VOP_LOCK(vp1, LK_EXCLUSIVE); if (VOP_LOCK(vp2, LK_NOWAIT \| LK_EXCLUSIVE) !=…
				rmacklemAuthorUnsubmitted Done Inline Actions Hmm, after seeing your comment w.r.t. not locking two vnodes simultaneously, I was going to change the code to lock/unlock them both for each read/write. (Basically let vn_rdwr() do the locking/unlocking.) The advantage of this is I could use VOP_IOCTL() to look for holes instead of comparing the read bytes for all zeros. The disadvantage I can see w.r.t. the above is that other threads can write to either file concurrently with the copy and produce unpredictable results. So, should I lock them both using an algorithm like kib@'s to avoid deadlock or should I do the above and unlock/lock the vnodes for every block read/written? rmacklem: Hmm, after seeing your comment w.r.t. not locking two vnodes simultaneously, I was going to…
				kibUnsubmitted Done Inline Actions It is up to you, but there is still one fundamental detail that you seems to miss. I already pointed out in other response that your patch lack range locks for both reads and writes. FreeBSD has separate locks for vnode (and usuall inode) structures AKA vnode lock, and lock for user data content AKA range lock. For large enough read(2) or write(2), vnode lock could be dropped after partial op progress, and to guarantee that no torn writes are ever seen, we lock range lock for corresponding vnode for the whole range which is read or written. I think that it does not matter much for correctness whether you take two locks simultaneously or serially, but if you take them simultaneously other things becomes quite tricky. E.g. since you are writing, vn_start_write() must be done, and it cannot be done in blocking manner while a vnode lock is held. Also, you should not sleep for other mp vnode lock while owning start-write reference on your mp. Similarly, rangelocks nesting should be handled, and there is no trylock op for them yet. kib: It is up to you, but there is still one fundamental detail that you seems to miss. I already…
				rmacklemAuthorUnsubmitted Done Inline Actions I'll wait for others to respond on FreeBSD-fs@ to see if atomicity is needed. Until then, there is no concurrent locking of both files in the patch. rmacklem: I'll wait for others to respond on FreeBSD-fs@ to see if atomicity is needed. Until then, there…
				rmacklemAuthorUnsubmitted Done Inline Actions The current patch does rangelock locks on both files, using the rangelock_rlock_trylock() call in D20645. This provides atomic updating of the file data. The vnodes are still locked/unlocked in the loops, so that VOP_IOCTL() with FIOSEEKDATA/FIOSEEKHOLE can be used to find holes. rmacklem: The current patch does rangelock locks on both files, using the rangelock_rlock_trylock() call…
				if (error != 0) {
				VOP_UNLOCK(outvp, 0);
				asomersUnsubmitted Not Done Inline Actions s/outvp/invp/ asomers: s/outvp/invp/
				goto out;
				}

				retlen = len;
				error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
				flags);
				VOP_UNLOCK(invp, 0);
				VOP_UNLOCK(outvp, 0);
				asomersUnsubmitted Done Inline Actions Best to unlock in LIFO order. asomers: Best to unlock in LIFO order.
				out:
				if (infp != NULL)
				asomersUnsubmitted Done Inline Actions It probably doesn't matter, but I would `fdrop` in LIFO order too. asomers: It probably doesn't matter, but I would `fdrop` in LIFO order too.
				fdrop(infp, td);
				if (outfp != NULL)
				fdrop(outfp, td);
				td->td_retval[0] = retlen;
				return (error);
				}

				int
				sys_copy_file_range(struct thread td, struct copy_file_range_args uap)
				{
				off_t inoff, outoff, inoffp, outoffp;
				int error;

				inoffp = outoffp = NULL;
				if (uap->inoffp != NULL) {
				error = copyin(uap->inoffp, &inoff, sizeof(off_t));
				if (error != 0)
				return (error);
				inoffp = &inoff;
				}
				if (uap->outoffp != NULL) {
				error = copyin(uap->outoffp, &outoff, sizeof(off_t));
				if (error != 0)
				return (error);
				outoffp = &outoff;
				}
				error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
				outoffp, uap->len, uap->flags);
				if (error == 0 && uap->inoffp != NULL)
				error = copyout(inoffp, uap->inoffp, sizeof(off_t));
				if (error == 0 && uap->outoffp != NULL)
				error = copyout(outoffp, uap->outoffp, sizeof(off_t));
				return (error);
				}