Changeset View
Standalone View
sys/kern/vfs_syscalls.c
Show First 20 Lines • Show All 4,808 Lines • ▼ Show 20 Lines | |||||
sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) | sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) | ||||
{ | { | ||||
int error; | int error; | ||||
error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, | error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, | ||||
uap->advice); | uap->advice); | ||||
return (kern_posix_error(td, error)); | return (kern_posix_error(td, error)); | ||||
} | } | ||||
int | |||||
kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, | |||||
off_t *outoffp, size_t len, u_int flags) | |||||
{ | |||||
struct file *infp, *outfp; | |||||
struct vnode *invp, *outvp; | |||||
int error, lock_flags; | |||||
size_t retlen; | |||||
infp = outfp = NULL; | |||||
retlen = 0; | |||||
/* Get the file structures for the file descriptors. */ | |||||
error = fget_read(td, infd, &cap_read_rights, &infp); | |||||
if (error != 0) | |||||
goto out; | |||||
error = fget_write(td, outfd, &cap_write_rights, &outfp); | |||||
if (error != 0) | |||||
goto out; | |||||
/* Set the offset pointers to the correct place. */ | |||||
if (inoffp == NULL) | |||||
inoffp = &infp->f_offset; | |||||
if (outoffp == NULL) | |||||
outoffp = &outfp->f_offset; | |||||
/* Sanity check the f_flag bits. */ | |||||
if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE || | |||||
(infp->f_flag & FREAD) == 0) { | |||||
kib: You need a bwillwrite() call somewhere.
Due to the nature of the operation, I suspect that you… | |||||
Done Inline ActionsThis variant of the patch does a bwillwrite() before every "blksize" write, so hopefully rmacklem: This variant of the patch does a bwillwrite() before every "blksize" write, so hopefully
this… | |||||
error = EBADF; | |||||
goto out; | |||||
Done Inline ActionsThe invp == outvp error case isn't mentioned in the man page. asomers: The `invp == outvp` error case isn't mentioned in the man page. | |||||
} | |||||
/* Lock the vnodes. */ | |||||
invp = infp->f_vnode; | |||||
error = vn_lock(invp, LK_SHARED); | |||||
if (error != 0) | |||||
goto out; | |||||
outvp = outfp->f_vnode; | |||||
if (MNT_SHARED_WRITES(outvp->v_mount)) | |||||
lock_flags = LK_SHARED; | |||||
else | |||||
lock_flags = LK_EXCLUSIVE; | |||||
error = vn_lock(outvp, lock_flags); | |||||
kibUnsubmitted Done Inline ActionsYou should not lock two vnodes simultaneously. We only allow this for the case where the first vnode is a directory, and second vnode is the file linked in the directory. Imagine two thread doing copy_file_range(fd1, fd2) and copy_file_range(fd2, fd1): you get user-controllable LOR. kib: You should not lock two vnodes simultaneously. We only allow this for the case where the first… | |||||
asomersUnsubmitted Not Done Inline ActionsWhat's the alternative? Is there a trylock variant that would work instead? Or must the entire mountpoint be locked? asomers: What's the alternative? Is there a trylock variant that would work instead? Or must the… | |||||
kibUnsubmitted Not Done Inline Actionsfor (;;) { VOP_LOCK(vp1, LK_EXCLUSIVE); if (VOP_LOCK(vp2, LK_NOWAIT | LK_EXCLUSIVE) != 0) { VOP_UNLOCK(vp1, 0); VOP_LOCK(vp2, LK_EXCLUSIVE); VOP_UNLOCK(vp2); } else { break; } } Checks for doomed vnodes avoided for clarity. kib: ```
for (;;) {
VOP_LOCK(vp1, LK_EXCLUSIVE);
if (VOP_LOCK(vp2, LK_NOWAIT | LK_EXCLUSIVE) !=… | |||||
rmacklemAuthorUnsubmitted Done Inline ActionsHmm, after seeing your comment w.r.t. not locking two vnodes simultaneously, The disadvantage I can see w.r.t. the above is that other threads can write to either So, should I lock them both using an algorithm like kib@'s to avoid deadlock rmacklem: Hmm, after seeing your comment w.r.t. not locking two vnodes simultaneously,
I was going to… | |||||
kibUnsubmitted Done Inline ActionsIt is up to you, but there is still one fundamental detail that you seems to miss. I already pointed out in other response that your patch lack range locks for both reads and writes. FreeBSD has separate locks for vnode (and usuall inode) structures AKA vnode lock, and lock for user data content AKA range lock. For large enough read(2) or write(2), vnode lock could be dropped after partial op progress, and to guarantee that no torn writes are ever seen, we lock range lock for corresponding vnode for the whole range which is read or written. I think that it does not matter much for correctness whether you take two locks simultaneously or serially, but if you take them simultaneously other things becomes quite tricky. E.g. since you are writing, vn_start_write() must be done, and it cannot be done in blocking manner while a vnode lock is held. Also, you should not sleep for other mp vnode lock while owning start-write reference on your mp. Similarly, rangelocks nesting should be handled, and there is no trylock op for them yet. kib: It is up to you, but there is still one fundamental detail that you seems to miss. I already… | |||||
rmacklemAuthorUnsubmitted Done Inline ActionsI'll wait for others to respond on FreeBSD-fs@ to see if atomicity is needed. rmacklem: I'll wait for others to respond on FreeBSD-fs@ to see if atomicity is needed.
Until then, there… | |||||
rmacklemAuthorUnsubmitted Done Inline ActionsThe current patch does rangelock locks on both files, using the rmacklem: The current patch does rangelock locks on both files, using the
rangelock_rlock_trylock() call… | |||||
if (error != 0) { | |||||
VOP_UNLOCK(outvp, 0); | |||||
asomersUnsubmitted Not Done Inline Actionss/outvp/invp/ asomers: s/outvp/invp/ | |||||
goto out; | |||||
} | |||||
retlen = len; | |||||
error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen, | |||||
flags); | |||||
VOP_UNLOCK(invp, 0); | |||||
VOP_UNLOCK(outvp, 0); | |||||
asomersUnsubmitted Done Inline ActionsBest to unlock in LIFO order. asomers: Best to unlock in LIFO order. | |||||
out: | |||||
if (infp != NULL) | |||||
asomersUnsubmitted Done Inline ActionsIt probably doesn't matter, but I would fdrop in LIFO order too. asomers: It probably doesn't matter, but I would `fdrop` in LIFO order too. | |||||
fdrop(infp, td); | |||||
if (outfp != NULL) | |||||
fdrop(outfp, td); | |||||
td->td_retval[0] = retlen; | |||||
return (error); | |||||
} | |||||
int | |||||
sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap) | |||||
{ | |||||
off_t inoff, outoff, *inoffp, *outoffp; | |||||
int error; | |||||
inoffp = outoffp = NULL; | |||||
if (uap->inoffp != NULL) { | |||||
error = copyin(uap->inoffp, &inoff, sizeof(off_t)); | |||||
if (error != 0) | |||||
return (error); | |||||
inoffp = &inoff; | |||||
} | |||||
if (uap->outoffp != NULL) { | |||||
error = copyin(uap->outoffp, &outoff, sizeof(off_t)); | |||||
if (error != 0) | |||||
return (error); | |||||
outoffp = &outoff; | |||||
} | |||||
error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd, | |||||
outoffp, uap->len, uap->flags); | |||||
if (error == 0 && uap->inoffp != NULL) | |||||
error = copyout(inoffp, uap->inoffp, sizeof(off_t)); | |||||
if (error == 0 && uap->outoffp != NULL) | |||||
error = copyout(outoffp, uap->outoffp, sizeof(off_t)); | |||||
return (error); | |||||
} |
You need a bwillwrite() call somewhere.
Due to the nature of the operation, I suspect that you need to insert the calls to bwillwrite() somewhere inside the copy loop, at the moment where no resources like vnode locks or started writes are held.