Changeset View
Standalone View
sys/kern/vfs_syscalls.c
Show First 20 Lines • Show All 4,808 Lines • ▼ Show 20 Lines | |||||
sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) | sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) | ||||
{ | { | ||||
int error; | int error; | ||||
error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, | error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, | ||||
uap->advice); | uap->advice); | ||||
return (kern_posix_error(td, error)); | return (kern_posix_error(td, error)); | ||||
} | } | ||||
int | |||||
kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, | |||||
off_t *outoffp, size_t len, unsigned int flags) | |||||
{ | |||||
struct file *infp, *outfp; | |||||
struct vnode *invp, *outvp; | |||||
int error; | |||||
size_t retlen; | |||||
infp = outfp = NULL; | |||||
retlen = 0; | |||||
printf("copyf len=%d\n", len); | |||||
/* Get the file structures for the file descriptors. */ | |||||
error = fget_read(td, infd, &cap_read_rights, &infp); | |||||
if (error != 0) | |||||
goto out; | |||||
error = fget_write(td, outfd, &cap_write_rights, &outfp); | |||||
if (error != 0) | |||||
goto out; | |||||
printf("inoffp=%p outoffp=%p\n", inoffp, outoffp); | |||||
/* Set the offset pointers to the correct place. */ | |||||
if (inoffp == NULL) | |||||
inoffp = &infp->f_offset; | |||||
if (outoffp == NULL) | |||||
outoffp = &outfp->f_offset; | |||||
printf("inoff=%qd outoff=%qd\n", *inoffp, *outoffp); | |||||
kib: You need a bwillwrite() call somewhere.
Due to the nature of the operation, I suspect that you… | |||||
Done Inline ActionsThis variant of the patch does a bwillwrite() before every "blksize" write, so hopefully rmacklem: This variant of the patch does a bwillwrite() before every "blksize" write, so hopefully
this… | |||||
invp = infp->f_vnode; | |||||
outvp = outfp->f_vnode; | |||||
Done Inline ActionsThe invp == outvp error case isn't mentioned in the man page. asomers: The `invp == outvp` error case isn't mentioned in the man page. | |||||
/* Sanity check the f_flag bits. */ | |||||
if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE || | |||||
(infp->f_flag & FREAD) == 0 || invp == outvp) { | |||||
error = EBADF; | |||||
goto out; | |||||
} | |||||
printf("at vop\n"); | |||||
retlen = len; | |||||
error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen, | |||||
flags); | |||||
printf("aft vop=%d retlen=%d\n", error, retlen); | |||||
out: | |||||
Done Inline ActionsYou should not lock two vnodes simultaneously. We only allow this for the case where the first vnode is a directory, and second vnode is the file linked in the directory. Imagine two thread doing copy_file_range(fd1, fd2) and copy_file_range(fd2, fd1): you get user-controllable LOR. kib: You should not lock two vnodes simultaneously. We only allow this for the case where the first… | |||||
Not Done Inline ActionsWhat's the alternative? Is there a trylock variant that would work instead? Or must the entire mountpoint be locked? asomers: What's the alternative? Is there a trylock variant that would work instead? Or must the… | |||||
Not Done Inline Actionsfor (;;) { VOP_LOCK(vp1, LK_EXCLUSIVE); if (VOP_LOCK(vp2, LK_NOWAIT | LK_EXCLUSIVE) != 0) { VOP_UNLOCK(vp1, 0); VOP_LOCK(vp2, LK_EXCLUSIVE); VOP_UNLOCK(vp2); } else { break; } } Checks for doomed vnodes avoided for clarity. kib: ```
for (;;) {
VOP_LOCK(vp1, LK_EXCLUSIVE);
if (VOP_LOCK(vp2, LK_NOWAIT | LK_EXCLUSIVE) !=… | |||||
Done Inline ActionsHmm, after seeing your comment w.r.t. not locking two vnodes simultaneously, The disadvantage I can see w.r.t. the above is that other threads can write to either So, should I lock them both using an algorithm like kib@'s to avoid deadlock rmacklem: Hmm, after seeing your comment w.r.t. not locking two vnodes simultaneously,
I was going to… | |||||
Done Inline ActionsIt is up to you, but there is still one fundamental detail that you seems to miss. I already pointed out in other response that your patch lack range locks for both reads and writes. FreeBSD has separate locks for vnode (and usuall inode) structures AKA vnode lock, and lock for user data content AKA range lock. For large enough read(2) or write(2), vnode lock could be dropped after partial op progress, and to guarantee that no torn writes are ever seen, we lock range lock for corresponding vnode for the whole range which is read or written. I think that it does not matter much for correctness whether you take two locks simultaneously or serially, but if you take them simultaneously other things becomes quite tricky. E.g. since you are writing, vn_start_write() must be done, and it cannot be done in blocking manner while a vnode lock is held. Also, you should not sleep for other mp vnode lock while owning start-write reference on your mp. Similarly, rangelocks nesting should be handled, and there is no trylock op for them yet. kib: It is up to you, but there is still one fundamental detail that you seems to miss. I already… | |||||
Done Inline ActionsI'll wait for others to respond on FreeBSD-fs@ to see if atomicity is needed. rmacklem: I'll wait for others to respond on FreeBSD-fs@ to see if atomicity is needed.
Until then, there… | |||||
Done Inline ActionsThe current patch does rangelock locks on both files, using the rmacklem: The current patch does rangelock locks on both files, using the
rangelock_rlock_trylock() call… | |||||
if (outfp != NULL) | |||||
fdrop(outfp, td); | |||||
Not Done Inline Actionss/outvp/invp/ asomers: s/outvp/invp/ | |||||
if (infp != NULL) | |||||
fdrop(infp, td); | |||||
td->td_retval[0] = retlen; | |||||
printf("eo copyf\n"); | |||||
return (error); | |||||
} | |||||
int | |||||
Done Inline ActionsBest to unlock in LIFO order. asomers: Best to unlock in LIFO order. | |||||
sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap) | |||||
{ | |||||
Done Inline ActionsIt probably doesn't matter, but I would fdrop in LIFO order too. asomers: It probably doesn't matter, but I would `fdrop` in LIFO order too. | |||||
off_t inoff, outoff, *inoffp, *outoffp; | |||||
int error; | |||||
inoffp = outoffp = NULL; | |||||
if (uap->inoffp != NULL) { | |||||
error = copyin(uap->inoffp, &inoff, sizeof(off_t)); | |||||
if (error != 0) | |||||
return (error); | |||||
inoffp = &inoff; | |||||
} | |||||
if (uap->outoffp != NULL) { | |||||
error = copyin(uap->outoffp, &outoff, sizeof(off_t)); | |||||
if (error != 0) | |||||
return (error); | |||||
outoffp = &outoff; | |||||
} | |||||
error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd, | |||||
outoffp, uap->len, uap->flags); | |||||
if (error == 0 && uap->inoffp != NULL) | |||||
error = copyout(inoffp, uap->inoffp, sizeof(off_t)); | |||||
if (error == 0 && uap->outoffp != NULL) | |||||
error = copyout(outoffp, uap->outoffp, sizeof(off_t)); | |||||
return (error); | |||||
} |
You need a bwillwrite() call somewhere.
Due to the nature of the operation, I suspect that you need to insert the calls to bwillwrite() somewhere inside the copy loop, at the moment where no resources like vnode locks or started writes are held.