Changeset View
Standalone View
sys/kern/vfs_vnops.c
Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines | |||||
#include <sys/kdb.h> | #include <sys/kdb.h> | ||||
#include <sys/ktr.h> | #include <sys/ktr.h> | ||||
#include <sys/stat.h> | #include <sys/stat.h> | ||||
#include <sys/priv.h> | #include <sys/priv.h> | ||||
#include <sys/proc.h> | #include <sys/proc.h> | ||||
#include <sys/limits.h> | #include <sys/limits.h> | ||||
#include <sys/lock.h> | #include <sys/lock.h> | ||||
#include <sys/mman.h> | #include <sys/mman.h> | ||||
#include <sys/malloc.h> | |||||
#include <sys/mount.h> | #include <sys/mount.h> | ||||
#include <sys/mutex.h> | #include <sys/mutex.h> | ||||
#include <sys/namei.h> | #include <sys/namei.h> | ||||
#include <sys/vnode.h> | #include <sys/vnode.h> | ||||
#include <sys/bio.h> | #include <sys/bio.h> | ||||
#include <sys/buf.h> | #include <sys/buf.h> | ||||
#include <sys/filio.h> | #include <sys/filio.h> | ||||
#include <sys/resourcevar.h> | #include <sys/resourcevar.h> | ||||
Show All 12 Lines | |||||
#include <vm/vm.h> | #include <vm/vm.h> | ||||
#include <vm/vm_extern.h> | #include <vm/vm_extern.h> | ||||
#include <vm/pmap.h> | #include <vm/pmap.h> | ||||
#include <vm/vm_map.h> | #include <vm/vm_map.h> | ||||
#include <vm/vm_object.h> | #include <vm/vm_object.h> | ||||
#include <vm/vm_page.h> | #include <vm/vm_page.h> | ||||
#include <vm/vnode_pager.h> | #include <vm/vnode_pager.h> | ||||
#include <machine/vmparam.h> | |||||
#ifdef HWPMC_HOOKS | #ifdef HWPMC_HOOKS | ||||
#include <sys/pmckern.h> | #include <sys/pmckern.h> | ||||
#endif | #endif | ||||
static fo_rdwr_t vn_read; | static fo_rdwr_t vn_read; | ||||
static fo_rdwr_t vn_write; | static fo_rdwr_t vn_write; | ||||
static fo_rdwr_t vn_io_fault; | static fo_rdwr_t vn_io_fault; | ||||
static fo_truncate_t vn_truncate; | static fo_truncate_t vn_truncate; | ||||
▲ Show 20 Lines • Show All 2,387 Lines • ▼ Show 20 Lines | |||||
vn_fsid(struct vnode *vp, struct vattr *va)
{
	fsid_t *f;

	/*
	 * Pack the mount point's two fsid words into the single wider
	 * va_fsid value: val[1] in the high bits, val[0] in the low bits.
	 */
	f = &vp->v_mount->mnt_stat.f_fsid;
	va->va_fsid = (uint32_t)f->val[1];
	va->va_fsid <<= sizeof(f->val[1]) * NBBY;
	va->va_fsid += (uint32_t)f->val[0];
}
/*
 * Test len bytes of data starting at dat for all bytes == 0.
 * Return true if all bytes are zero, false otherwise.
 * Expects dat to be well aligned.
 */
static bool
mem_iszero(void *dat, int len)
{
	const u_int *wp;
	const char *bp;
	int resid;

	/*
	 * Scan word-sized chunks first; dat is expected to be aligned
	 * well enough for u_int accesses.
	 */
	wp = dat;
	for (resid = len; resid >= (int)sizeof(*wp); resid -= sizeof(*wp)) {
		if (*wp++ != 0)
			return (false);
	}

	/* Check any trailing bytes that do not fill a whole word. */
	for (bp = (const char *)wp; resid > 0; resid--) {
		if (*bp++ != '\0')
			return (false);
	}
	return (true);
}
Done Inline Actionsoff_t is signed type, so the overflow is undefined, and you cannot check for it by testing the wrap. We do specify -fwrapv to compiler, but there is a desire to not add new code which depends on undefined behavior, since the fight with compiler will be lost anyway. kib: off_t is signed type, so the overflow is undefined, and you cannot check for it by testing the… | |||||
Done Inline ActionsIn the code below, I used two uint64_t variables to do the overflow checks. I suppose I could define the variables as uoff_t, but I don't know if that will rmacklem: In the code below, I used two uint64_t variables to do the overflow checks.
I think I got the… | |||||
/*
 * Write an xfer sized chunk to outvp in blksize blocks from dat.
 * dat is a maximum of blksize in length and can be written repeatedly in
 * the chunk.
 */
static int | |||||
vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, size_t xfer, | |||||
Done Inline ActionsThe invp == outvp error case should return EBADF, not EINVAL. asomers: The `invp == outvp` error case should return `EBADF`, not `EINVAL`. | |||||
Done Inline ActionsThe syscall actually got this right and I don't even recall what the correct NFSv4.2 rmacklem: The syscall actually got this right and I don't even recall what the correct NFSv4.2
error is… | |||||
u_long blksize) | |||||
{ | |||||
struct mount *mp; | |||||
size_t xfer2; | |||||
int error, lckf; | |||||
mp = NULL; | |||||
error = vn_start_write(outvp, &mp, V_WAIT); | |||||
if (error == 0) { | |||||
if (MNT_SHARED_WRITES(mp)) | |||||
lckf = LK_SHARED; | |||||
else | |||||
lckf = LK_EXCLUSIVE; | |||||
error = vn_lock(outvp, lckf); | |||||
} | |||||
if (error == 0) { | |||||
do { | |||||
xfer2 = xfer; | |||||
if (xfer2 > blksize) | |||||
xfer2 = blksize; | |||||
error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, outoff, | |||||
UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, | |||||
NULL, NULL, curthread); | |||||
Done Inline ActionsYou can avoid the malloc and VFS_STATFS by just using the cached value: invp->v_mount->mnt_stat.f_iosize. asomers: You can avoid the malloc and `VFS_STATFS` by just using the cached value: `invp->v_mount… | |||||
outoff += xfer2; | |||||
xfer -= xfer2; | |||||
} while (xfer > 0 && error == 0); | |||||
VOP_UNLOCK(outvp, 0); | |||||
} | |||||
if (mp != NULL) | |||||
vn_finished_write(mp); | |||||
return (error); | |||||
} | |||||
int | |||||
Done Inline ActionsI really think you should use the least common multiple of in and out block sizes. kib: I really think you should use the least common multiple of in and out block sizes. | |||||
Done Inline ActionsMake that the LCM of in, out, and 16384. asomers: Make that the LCM of in, out, and 16384. | |||||
Done Inline ActionsBasically done. It now uses holesize if it can get that and falls back on f_iosize. rmacklem: Basically done. It now uses holesize if it can get that and falls back on f_iosize.
Assuming… | |||||
vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, | |||||
off_t *outoffp, size_t *lenp, unsigned int flags) | |||||
{ | |||||
struct vattr va; | |||||
Done Inline ActionsThe wait in vn_start_write() is after the vnode lock in global order. You are introducing the deadlock there. kib: The wait in vn_start_write() is after the vnode lock in global order. You are introducing the… | |||||
Done Inline ActionsOops, I've made this mistake before. rmacklem: Oops, I've made this mistake before.
| |||||
struct mount *mp; | |||||
off_t startoff, endoff; | |||||
u_long blksize; | |||||
int error; | |||||
bool cantseek, readzeros; | |||||
ssize_t aresid; | |||||
Done Inline ActionsDo not use malloc(M_WAITOK) while holding vnode lock. Pagedaemon might need to page out a page belonging to the vnode locked by current thread.. kib: Do not use malloc(M_WAITOK) while holding vnode lock. Pagedaemon might need to page out a page… | |||||
size_t copylen, len, savlen, xfer, xfer2; | |||||
char *dat; | |||||
uint64_t uvalin, uvalout; | |||||
long holein, holeout; | |||||
void *rl_rcookie, *rl_wcookie; | |||||
struct thread *td = curthread; | |||||
Done Inline Actionsglibc handles this simply: the fallback is implemented in userspace instead of kernelspace. asomers: glibc handles this simply: the fallback is implemented in userspace instead of kernelspace. | |||||
Done Inline ActionsYep. Although the Linux man page claims this is done in the kernel to reduce the I think doing it across file systems in the kernel makes sense. I wanted the NFS server to do it across multiple exported file systems, so that it rmacklem: Yep. Although the Linux man page claims this is done in the kernel to reduce the
number of… | |||||
savlen = len = *lenp; | |||||
*lenp = 0; /* Return 0 len for errors. */ | |||||
error = 0; | |||||
dat = NULL; | |||||
rl_rcookie = rl_wcookie = NULL; | |||||
/* Do some sanity checks on the arguments. */ | |||||
uvalin = *inoffp; | |||||
uvalin += len; | |||||
uvalout = *outoffp; | |||||
uvalout += len; | |||||
if (invp->v_type == VDIR || outvp->v_type == VDIR) | |||||
error = EISDIR; | |||||
Done Inline ActionsEither this or next vn_rdwr() calls are done in the way which skips range locking. The result is that torn writes can be seen. FreeBSD makes the guarantee that each single write(2) syscall effects are either seen as a whole, or not seen at all, by each single read(2) syscall. kib: Either this or next vn_rdwr() calls are done in the way which skips range locking. The result… | |||||
Done Inline ActionsI'll see what others say about this. rmacklem: I'll see what others say about this.
If they want atomicity, then we need to create non… | |||||
else if (*inoffp < 0 || uvalin > INT64_MAX || uvalin < | |||||
(uint64_t)*inoffp || *outoffp < 0 || uvalout > INT64_MAX || | |||||
uvalout < (uint64_t)*outoffp || invp->v_type != VREG || | |||||
outvp->v_type != VREG || invp == outvp) | |||||
error = EINVAL; | |||||
if (error != 0) | |||||
goto out; | |||||
error = vn_lock(invp, LK_SHARED); | |||||
if (error != 0) | |||||
goto out; | |||||
if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) | |||||
holein = 0; | |||||
Done Inline ActionsThis code looks like it ignores holes altogether. If the destination file contains a non-hole and the source file contains a hole, then you need to either hole-punch the destination or at least fill it with zeros. asomers: This code looks like it ignores holes altogether. If the destination file contains a non-hole… | |||||
Done Inline ActionsThanks, good catch. I was thinking of/testing with an empty output file. The code now truncates the output file if the range covers the output file rmacklem: Thanks, good catch. I was thinking of/testing with an empty output file.
The code now… | |||||
/* Check that the offset + len does not go past EOF of invp. */ | |||||
if (error == 0) | |||||
error = VOP_GETATTR(invp, &va, td->td_ucred); | |||||
if (error == 0 && va.va_size < (*inoffp + len)) | |||||
error = EINVAL; | |||||
VOP_UNLOCK(invp, 0); | |||||
if (error != 0) | |||||
goto out; | |||||
/* Range lock the byte ranges for both invp and outvp. */ | |||||
for (;;) { | |||||
rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp + | |||||
len); | |||||
rl_rcookie = vn_rangelock_rlock_trylock(invp, *inoffp, | |||||
*inoffp + len); | |||||
if (rl_rcookie != NULL) | |||||
break; | |||||
vn_rangelock_unlock(outvp, rl_wcookie); | |||||
rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len); | |||||
vn_rangelock_unlock(invp, rl_rcookie); | |||||
} | |||||
/* | |||||
* If the two vnodes are for the same file system, try the | |||||
* VOP_COPY_FILE_RANGE() call first, but do it here if the VOP | |||||
* call fails. | |||||
*/ | |||||
if (invp->v_mount == outvp->v_mount) { | |||||
error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp, | |||||
Done Inline ActionsThis ignores errors that it shouldn't, like EIO. It should return for any error except EOPNOTSUPP. Better yet, the fallback logic should be moved into vop_stdcopy_file_range and that should be the default method for this vop. asomers: This ignores errors that it shouldn't, like EIO. It should return for any error except… | |||||
Done Inline ActionsWell, I am not sure which errors are going to be recoverable by doing vn_rdwr() in a loop. I have no idea what other errors a non-FreeBSD NFSv4.2 server might return which is If you think fuse (or some future local fs that chooses to implement the VOP) will I don't see putting the "fallback" in the VOP, since it is designed to work across I think working across multiple file systems could be a useful feature. rmacklem: Well, I am not sure which errors are going to be recoverable by doing vn_rdwr() in a loop.
One… | |||||
Done Inline ActionsActually any error VOP_COPY_FILE_RANGE() returns gets returned by the syscall. rmacklem: Actually any error VOP_COPY_FILE_RANGE() returns gets returned by the syscall.
The current code… | |||||
Done Inline ActionsThis ignores errors that it shouldn't, like EIO. It should return for any error except EOPNOTSUPP. Better yet, the fallback logic should be moved into vop_stdcopy_file_range and that should be the default method for this vop. asomers: This ignores errors that it shouldn't, like `EIO`. It should return for any error except… | |||||
lenp, flags); | |||||
if (error == 0) | |||||
goto out; | |||||
} | |||||
mp = NULL; | |||||
error = vn_start_write(outvp, &mp, V_WAIT); | |||||
if (error == 0) | |||||
error = vn_lock(outvp, LK_EXCLUSIVE); | |||||
if (error == 0) { | |||||
if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) | |||||
holeout = 0; | |||||
/* | |||||
* Holes that are past EOF do not need to be written as a block | |||||
* of zero bytes. So, truncate the output file as far as | |||||
* possible and then use va.va_size to decide if writing 0 | |||||
* bytes is necessary in the loop below. | |||||
*/ | |||||
error = VOP_GETATTR(outvp, &va, td->td_ucred); | |||||
if (error == 0 && va.va_size > *outoffp && va.va_size <= | |||||
*outoffp + len) { | |||||
VATTR_NULL(&va); | |||||
va.va_size = *outoffp; | |||||
error = VOP_SETATTR(outvp, &va, td->td_ucred); | |||||
} | |||||
VOP_UNLOCK(outvp, 0); | |||||
} | |||||
if (mp != NULL) | |||||
vn_finished_write(mp); | |||||
if (error != 0) | |||||
goto out; | |||||
/* | |||||
* Set the blksize to the larger of the hole sizes for invp and outvp. | |||||
* If hole sizes aren't available, set the blksize to the larger | |||||
* f_iosize of invp and outvp. | |||||
* This code expects the hole sizes and f_iosizes to be powers of 2. | |||||
* This value is clipped at 4Kbytes and 1Mbyte. | |||||
*/ | |||||
if (holein > 0 && holeout > 0) | |||||
if (holein > holeout) | |||||
blksize = holein; | |||||
else | |||||
blksize = holeout; | |||||
else if (invp->v_mount->mnt_stat.f_iosize > | |||||
outvp->v_mount->mnt_stat.f_iosize) | |||||
blksize = invp->v_mount->mnt_stat.f_iosize; | |||||
else | |||||
blksize = outvp->v_mount->mnt_stat.f_iosize; | |||||
if (blksize < 4096) | |||||
blksize = 4096; | |||||
else if (blksize > 1048576) | |||||
blksize = 1048576; | |||||
dat = malloc(blksize, M_TEMP, M_WAITOK); | |||||
/* | |||||
* If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA | |||||
* to find holes. Otherwise, just scan the read block for all 0s | |||||
* in the inner loop where the data copying is done. | |||||
* Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may | |||||
* support holes on the server, but do not support FIOSEEKHOLE. | |||||
*/ | |||||
while (len > 0 && error == 0) { | |||||
endoff = 0; /* To shut up compilers. */ | |||||
cantseek = true; | |||||
startoff = *inoffp; | |||||
copylen = len; | |||||
readzeros = false; | |||||
/* | |||||
* Find the next data area. If there is just a hole to EOF, | |||||
* FIOSEEKDATA should fail and then we drop down into the | |||||
* inner loop and create the hole on the outvp file. | |||||
* (I do not know if any file system will report a hole to | |||||
* EOF via FOISEEKHOLE, but I am pretty sure FIOSEEKDATA | |||||
* will fail for those file systems.) | |||||
* | |||||
* For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, | |||||
* the code just falls through to the inner copy loop. | |||||
*/ | |||||
error = EINVAL; | |||||
if (holein > 0) | |||||
error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, | |||||
td->td_ucred, td); | |||||
if (error == 0) { | |||||
endoff = startoff; | |||||
error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, | |||||
td->td_ucred, td); | |||||
/* | |||||
* If the hole extends to the end of the byte range | |||||
* being copied, set error so that a copy of the | |||||
* last bytes (all 0s) will be done in the loop | |||||
* below, to ensure a hole is created in the output | |||||
* file. | |||||
* Also, set readzeros = true, since reads would return | |||||
* all 0 bytes. | |||||
*/ | |||||
if (startoff >= *inoffp + len) { | |||||
readzeros = true; | |||||
error = EINVAL; | |||||
} | |||||
} | |||||
if (error == 0) { | |||||
if (startoff > *inoffp) { | |||||
/* Found hole before data block. */ | |||||
xfer = startoff - *inoffp; | |||||
if (*inoffp < va.va_size) { | |||||
/* Must write 0s to punch hole. */ | |||||
xfer2 = va.va_size - *inoffp; | |||||
if (xfer2 > xfer) | |||||
xfer2 = xfer; | |||||
if (xfer2 > blksize) | |||||
memset(dat, 0, blksize); | |||||
else | |||||
memset(dat, 0, xfer2); | |||||
error = vn_write_outvp(outvp, dat, | |||||
*outoffp, xfer2, blksize); | |||||
} | |||||
if (error == 0) { | |||||
*inoffp += xfer; | |||||
*outoffp += xfer; | |||||
len -= xfer; | |||||
} | |||||
} | |||||
copylen = len; | |||||
if (copylen > endoff - startoff) | |||||
copylen = endoff - startoff; | |||||
cantseek = false; | |||||
} else { | |||||
cantseek = true; | |||||
startoff = *inoffp; | |||||
copylen = len; | |||||
error = 0; | |||||
} | |||||
xfer = blksize; | |||||
if (cantseek) { | |||||
/* | |||||
* Set first xfer to end at a block boundary, so that | |||||
* holes are more likely detected in the loop below via | |||||
* the for all bytes 0 method. | |||||
*/ | |||||
xfer -= (*inoffp % blksize); | |||||
} | |||||
if (readzeros) | |||||
memset(dat, 0, blksize); | |||||
/* Loop copying the data block. */ | |||||
while (copylen > 0 && error == 0) { | |||||
if (copylen < xfer) | |||||
xfer = copylen; | |||||
if (!readzeros) { | |||||
error = vn_lock(invp, LK_SHARED); | |||||
if (error != 0) | |||||
goto out; | |||||
error = vn_rdwr(UIO_READ, invp, dat, xfer, | |||||
startoff, UIO_SYSSPACE, IO_NODELOCKED, | |||||
td->td_ucred, NULL, &aresid, td); | |||||
VOP_UNLOCK(invp, 0); | |||||
/* | |||||
* Linux considers a range that exceeds EOF to | |||||
* be an error, so we will too. | |||||
*/ | |||||
if (error == 0 && aresid > 0) | |||||
error = EINVAL; | |||||
} | |||||
if (error == 0) { | |||||
/* | |||||
* Skip the write for holes past the initial EOF | |||||
* of the output file, unless this is the last | |||||
* write of the output file at EOF. | |||||
*/ | |||||
if (!cantseek || *outoffp < va.va_size || | |||||
xfer == len || !(readzeros || | |||||
mem_iszero(dat, xfer))) | |||||
error = vn_write_outvp(outvp, dat, | |||||
*outoffp, xfer, blksize); | |||||
if (error == 0) { | |||||
*inoffp += xfer; | |||||
startoff += xfer; | |||||
*outoffp += xfer; | |||||
copylen -= xfer; | |||||
len -= xfer; | |||||
} | |||||
} | |||||
xfer = blksize; | |||||
} | |||||
} | |||||
if (error == 0) | |||||
*lenp = savlen - len; | |||||
out: | |||||
if (rl_rcookie != NULL) | |||||
vn_rangelock_unlock(invp, rl_rcookie); | |||||
if (rl_wcookie != NULL) | |||||
vn_rangelock_unlock(outvp, rl_wcookie); | |||||
free(dat, M_TEMP); | |||||
return (error); | |||||
} | } | ||||
int | int | ||||
vn_fsync_buf(struct vnode *vp, int waitfor) | vn_fsync_buf(struct vnode *vp, int waitfor) | ||||
{ | { | ||||
struct buf *bp, *nbp; | struct buf *bp, *nbp; | ||||
struct bufobj *bo; | struct bufobj *bo; | ||||
struct mount *mp; | struct mount *mp; | ||||
▲ Show 20 Lines • Show All 88 Lines • Show Last 20 Lines |
You can avoid the extra malloc by using the globally defined zero_region (see vm/vm_kern.c). But comparing against that would still needlessly consume CPU cache. Better to write a mem_iszero helper that compares a single buffer against the literal 0.