Changeset View
Standalone View
sys/kern/vfs_subr.c
Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | |||||
#include <sys/conf.h> | #include <sys/conf.h> | ||||
#include <sys/counter.h> | #include <sys/counter.h> | ||||
#include <sys/dirent.h> | #include <sys/dirent.h> | ||||
#include <sys/event.h> | #include <sys/event.h> | ||||
#include <sys/eventhandler.h> | #include <sys/eventhandler.h> | ||||
#include <sys/extattr.h> | #include <sys/extattr.h> | ||||
#include <sys/file.h> | #include <sys/file.h> | ||||
#include <sys/fcntl.h> | #include <sys/fcntl.h> | ||||
#include <sys/filio.h> | |||||
#include <sys/jail.h> | #include <sys/jail.h> | ||||
#include <sys/kdb.h> | #include <sys/kdb.h> | ||||
#include <sys/kernel.h> | #include <sys/kernel.h> | ||||
#include <sys/kthread.h> | #include <sys/kthread.h> | ||||
#include <sys/ktr.h> | #include <sys/ktr.h> | ||||
#include <sys/lockf.h> | #include <sys/lockf.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/mount.h> | #include <sys/mount.h> | ||||
#include <sys/namei.h> | #include <sys/namei.h> | ||||
#include <sys/pctrie.h> | #include <sys/pctrie.h> | ||||
#include <sys/priv.h> | #include <sys/priv.h> | ||||
#include <sys/reboot.h> | #include <sys/reboot.h> | ||||
#include <sys/refcount.h> | #include <sys/refcount.h> | ||||
#include <sys/rwlock.h> | #include <sys/rwlock.h> | ||||
#include <sys/sched.h> | #include <sys/sched.h> | ||||
#include <sys/sleepqueue.h> | #include <sys/sleepqueue.h> | ||||
#include <sys/smp.h> | #include <sys/smp.h> | ||||
#include <sys/stat.h> | #include <sys/stat.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#include <sys/syslog.h> | #include <sys/syslog.h> | ||||
#include <sys/unistd.h> | |||||
#include <sys/vmmeter.h> | #include <sys/vmmeter.h> | ||||
#include <sys/vnode.h> | #include <sys/vnode.h> | ||||
#include <sys/watchdog.h> | #include <sys/watchdog.h> | ||||
#include <machine/stdarg.h> | #include <machine/stdarg.h> | ||||
#include <security/mac/mac_framework.h> | #include <security/mac/mac_framework.h> | ||||
▲ Show 20 Lines • Show All 5,560 Lines • ▼ Show 20 Lines | __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) | ||||
if (*mvp == NULL) | if (*mvp == NULL) | ||||
return; | return; | ||||
mtx_lock(&mp->mnt_listmtx); | mtx_lock(&mp->mnt_listmtx); | ||||
TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); | TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); | ||||
mtx_unlock(&mp->mnt_listmtx); | mtx_unlock(&mp->mnt_listmtx); | ||||
mnt_vnode_markerfree_active(mvp, mp); | mnt_vnode_markerfree_active(mvp, mp); | ||||
} | |||||
/*
 * Test len bytes of data starting at dat for all bytes == 0.
 * Return true if all bytes are zero, false otherwise.  A len that is
 * zero or negative examines nothing and returns true.
 * Expects dat to be well aligned, since it is read a word at a time.
 */
static bool
mem_iszero(void *dat, int len)
{
	const unsigned int *p;
	const char *cp;

	/* Compare a word at a time while at least a whole word remains. */
	for (p = dat; len >= (int)sizeof(*p); len -= (int)sizeof(*p), p++) {
		if (*p != 0)
			return (false);
	}

	/* Check the trailing bytes that do not fill a whole word. */
	for (cp = (const char *)p; len > 0; len--, cp++) {
		if (*cp != '\0')
			return (false);
	}
	return (true);
}
/* | |||||
* Write an xfer sized chunk to outvp in blksize blocks from dat. | |||||
* dat is a maximum of blksize in length and can be written repeatedly in | |||||
Not Done Inline ActionsShould this be "a minimum of blksize in length"? asomers: Should this be "a minimum of blksize in length"? | |||||
Done Inline ActionsNope. The call is often done with xfer < blksize. blksize is simply the size of rmacklem: Nope. The call is often done with xfer < blksize. blksize is simply the size of
the "dat"… | |||||
* the chunk. | |||||
*/ | |||||
static int | |||||
vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer, | |||||
u_long blksize, struct ucred *cred) | |||||
{ | |||||
struct mount *mp; | |||||
off_t xfer2; | |||||
int error, lckf; | |||||
mp = NULL; | |||||
error = vn_start_write(outvp, &mp, V_WAIT); | |||||
if (error == 0) { | |||||
if (MNT_SHARED_WRITES(mp)) | |||||
lckf = LK_SHARED; | |||||
else | |||||
lckf = LK_EXCLUSIVE; | |||||
error = vn_lock(outvp, lckf); | |||||
} | |||||
if (error == 0) { | |||||
do { | |||||
xfer2 = xfer; | |||||
if (xfer2 > blksize) | |||||
xfer2 = blksize; | |||||
error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, outoff, | |||||
UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, | |||||
cred, NULL, curthread); | |||||
outoff += xfer2; | |||||
Done Inline ActionsThis would be more readable as xfer2 = MIN(xfer, blksize). asomers: This would be more readable as `xfer2 = MIN(xfer, blksize)`. | |||||
xfer -= xfer2; | |||||
} while (xfer > 0 && error == 0); | |||||
VOP_UNLOCK(outvp, 0); | |||||
} | |||||
if (mp != NULL) | |||||
vn_finished_write(mp); | |||||
return (error); | |||||
} | |||||
/* | |||||
* Copy a byte range of one file to another. This function can handle the | |||||
* case where invp and outvp are on different file systems. | |||||
* It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there | |||||
* is no better file system specific way to do it. | |||||
*/ | |||||
int | |||||
vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, | |||||
struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, | |||||
struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) | |||||
{ | |||||
struct vattr va; | |||||
struct mount *mp; | |||||
struct uio io; | |||||
off_t startoff, endoff, xfer, xfer2; | |||||
u_long blksize; | |||||
int error; | |||||
bool cantseek, readzeros; | |||||
ssize_t aresid; | |||||
size_t copylen, len, savlen; | |||||
Not Done Inline ActionsProper place for vn_ functions is vfs_vnops.c. kib: Proper place for vn_ functions is vfs_vnops.c. | |||||
char *dat; | |||||
long holein, holeout; | |||||
savlen = len = *lenp; | |||||
error = 0; | |||||
dat = NULL; | |||||
error = vn_lock(invp, LK_SHARED); | |||||
if (error != 0) | |||||
goto out; | |||||
if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) | |||||
holein = 0; | |||||
VOP_UNLOCK(invp, 0); | |||||
if (error != 0) | |||||
goto out; | |||||
mp = NULL; | |||||
Not Done Inline ActionsAccording to the man page, holein should be an int, not a long. But that looks like a bug in the man page. asomers: According to the man page, `holein` should be an `int`, not a `long`. But that looks like a… | |||||
Done Inline ActionsNever trust a man page. Always look at the code for the real documentation.;-) rmacklem: Never trust a man page. Always look at the code for the real documentation.;-) | |||||
error = vn_start_write(outvp, &mp, V_WAIT); | |||||
if (error == 0) | |||||
error = vn_lock(outvp, LK_EXCLUSIVE); | |||||
if (error == 0) { | |||||
/* | |||||
* If fsize_td != NULL, do a vn_rlimit_fsize() call, | |||||
* now that outvp is locked. | |||||
*/ | |||||
if (fsize_td != NULL) { | |||||
io.uio_offset = *outoffp; | |||||
io.uio_resid = len; | |||||
error = vn_rlimit_fsize(outvp, &io, fsize_td); | |||||
if (error != 0) | |||||
error = EFBIG; | |||||
} | |||||
if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) | |||||
holeout = 0; | |||||
/* | |||||
* Holes that are past EOF do not need to be written as a block | |||||
* of zero bytes. So, truncate the output file as far as | |||||
* possible and then use va.va_size to decide if writing 0 | |||||
* bytes is necessary in the loop below. | |||||
*/ | |||||
if (error == 0) | |||||
error = VOP_GETATTR(outvp, &va, outcred); | |||||
if (error == 0 && va.va_size > *outoffp && va.va_size <= | |||||
*outoffp + len) { | |||||
#ifdef MAC | |||||
error = mac_vnode_check_write(curthread->td_ucred, | |||||
outcred, outvp); | |||||
#endif | |||||
if (error == 0) | |||||
Not Done Inline ActionsI'm not following this logic. This says that if the current EOF lies in the range of bytes to be written, then first truncate the file down to the beginning of the range. Why? It seems that it would be useful, OTOH, to truncate the file up to the beginning of the range, if the current EOF lies below the start of the range. asomers: I'm not following this logic. This says that if the current EOF lies in the range of bytes to… | |||||
Done Inline ActionsFor a simple example, lets say the output file is 15bytes in size and the input The file is vn_truncate_locked()'d to grow it, if needed. (This only happens if there rmacklem: For a simple example, lets say the output file is 15bytes in size and the input
file has a hole… | |||||
Not Done Inline ActionsOk, I get it. Simply writing zeros would create a dense output file. To create a sparse output file, it's necessary to truncate down first. asomers: Ok, I get it. Simply writing zeros would create a dense output file. To create a sparse… | |||||
error = vn_truncate_locked(outvp, *outoffp, | |||||
false, outcred); | |||||
if (error == 0) | |||||
va.va_size = *outoffp; | |||||
} | |||||
Done Inline ActionsThe #endif can be moved below the error == 0 check. asomers: The `#endif` can be moved below the `error == 0` check. | |||||
VOP_UNLOCK(outvp, 0); | |||||
} | |||||
if (mp != NULL) | |||||
vn_finished_write(mp); | |||||
if (error != 0) | |||||
goto out; | |||||
/* | |||||
* Set the blksize to the larger of the hole sizes for invp and outvp. | |||||
* If hole sizes aren't available, set the blksize to the larger | |||||
* f_iosize of invp and outvp. | |||||
* This code expects the hole sizes and f_iosizes to be powers of 2. | |||||
* This value is clipped at 4Kbytes and 1Mbyte. | |||||
*/ | |||||
if (holein > 0 && holeout > 0) | |||||
if (holein > holeout) | |||||
blksize = holein; | |||||
else | |||||
blksize = holeout; | |||||
else if (invp->v_mount->mnt_stat.f_iosize > | |||||
outvp->v_mount->mnt_stat.f_iosize) | |||||
Done Inline ActionsThis would be more readable as blksize = MAX(holein, holeout); if (blksize == 0) blksize = MAX(invp->v_mount->mnt_stat.f_iosize, outvp->v_mount->mnt_stat.f_iosize) asomers: This would be more readable as
```
blksize = MAX(holein, holeout);
if (blksize == 0)… | |||||
blksize = invp->v_mount->mnt_stat.f_iosize; | |||||
else | |||||
blksize = outvp->v_mount->mnt_stat.f_iosize; | |||||
if (blksize < 4096) | |||||
blksize = 4096; | |||||
else if (blksize > 1048576) | |||||
blksize = 1048576; | |||||
dat = malloc(blksize, M_TEMP, M_WAITOK); | |||||
/* | |||||
* If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA | |||||
* to find holes. Otherwise, just scan the read block for all 0s | |||||
Not Done Inline ActionsWhat's the significance of 1048576? asomers: What's the significance of 1048576? | |||||
Done Inline ActionsNot much. I don't know of a "this is the biggest file system block size for all file systems" blksize is how much is malloc()d. I didn't want it too large. I would be comfortable with anything that is small enough to safely malloc() and 10Mbytes? 100Mbytes? rmacklem: Not much. I don't know of a "this is the biggest file system block size for all file systems"… | |||||
Not Done Inline ActionsI would write this as 1024 * 1024. kib: I would write this as 1024 * 1024. | |||||
* in the inner loop where the data copying is done. | |||||
* Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may | |||||
* support holes on the server, but do not support FIOSEEKHOLE. | |||||
*/ | |||||
while (len > 0 && error == 0) { | |||||
endoff = 0; /* To shut up compilers. */ | |||||
cantseek = true; | |||||
startoff = *inoffp; | |||||
copylen = len; | |||||
readzeros = false; | |||||
/* | |||||
* Find the next data area. If there is just a hole to EOF, | |||||
* FIOSEEKDATA should fail and then we drop down into the | |||||
* inner loop and create the hole on the outvp file. | |||||
* (I do not know if any file system will report a hole to | |||||
* EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA | |||||
* will fail for those file systems.) | |||||
* | |||||
* For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, | |||||
* the code just falls through to the inner copy loop. | |||||
*/ | |||||
error = EINVAL; | |||||
if (holein > 0) | |||||
error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, | |||||
incred, curthread); | |||||
if (error == 0) { | |||||
endoff = startoff; | |||||
error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, | |||||
incred, curthread); | |||||
/* | |||||
* If the hole extends to the end of the byte range | |||||
* being copied, set error so that a copy of the | |||||
* last bytes (all 0s) will be done in the loop | |||||
* below, to ensure a hole is created in the output | |||||
* file. | |||||
* Also, set readzeros = true, since reads would return | |||||
* all 0 bytes. | |||||
*/ | |||||
if (startoff >= *inoffp + len) { | |||||
Done Inline ActionsAgain, this could be xfer = MIN(startoff - *inoffp, len). Or is there a reason you don't use the MIN and MAX macros? asomers: Again, this could be `xfer = MIN(startoff - *inoffp, len)`. Or is there a reason you don't use… | |||||
Done Inline ActionsWell, there are a couple of reasons I didn't use them, but as you can see, I have changed macros (or type cast the arguments in MAX()). This might break this code, since some variables are off_t (always int64_t I think) and some are size_t (uint32_t or uint64_t). 2 - When I write: x = y - z; if (x > X) x = X; I read that as "I want x to be y - z", but then I want to clip it at X. It's just the way I think when I code these things. For x = MIN(y - z, X), I read this as "take the lesser of y - z and X and then need to figure out why that is the case. But that's just me. I agree that for the "blksize" case, the MAX() macros make the code more readable for me, One minor nit here...you never say "in my opinion" for any of these comments and, rmacklem: Well, there are a couple of reasons I didn't use them, but as you can see, I have changed
the… | |||||
readzeros = true; | |||||
error = EINVAL; | |||||
} | |||||
} | |||||
if (error == 0) { | |||||
Done Inline ActionsAnother good use for MIN asomers: Another good use for `MIN` | |||||
if (startoff > *inoffp) { | |||||
/* Found hole before data block. */ | |||||
xfer = startoff - *inoffp; | |||||
if (xfer > len) | |||||
xfer = len; | |||||
if (*inoffp < va.va_size) { | |||||
/* Must write 0s to punch hole. */ | |||||
xfer2 = va.va_size - *inoffp; | |||||
if (xfer2 > xfer) | |||||
xfer2 = xfer; | |||||
if (xfer2 > blksize) | |||||
memset(dat, 0, blksize); | |||||
else | |||||
memset(dat, 0, xfer2); | |||||
error = vn_write_outvp(outvp, dat, | |||||
*outoffp, xfer2, blksize, outcred); | |||||
} | |||||
if (error == 0) { | |||||
*inoffp += xfer; | |||||
*outoffp += xfer; | |||||
len -= xfer; | |||||
} | |||||
Done Inline ActionsUse MIN here. asomers: Use `MIN` here. | |||||
} | |||||
copylen = len; | |||||
if (copylen > endoff - startoff) | |||||
copylen = endoff - startoff; | |||||
cantseek = false; | |||||
} else { | |||||
cantseek = true; | |||||
startoff = *inoffp; | |||||
copylen = len; | |||||
error = 0; | |||||
} | |||||
xfer = blksize; | |||||
if (cantseek) { | |||||
/* | |||||
* Set first xfer to end at a block boundary, so that | |||||
* holes are more likely detected in the loop below via | |||||
* the for all bytes 0 method. | |||||
*/ | |||||
xfer -= (*inoffp % blksize); | |||||
} | |||||
if (readzeros) | |||||
memset(dat, 0, blksize); | |||||
/* Loop copying the data block. */ | |||||
while (copylen > 0 && error == 0) { | |||||
if (copylen < xfer) | |||||
xfer = copylen; | |||||
if (!readzeros) { | |||||
error = vn_lock(invp, LK_SHARED); | |||||
if (error != 0) | |||||
goto out; | |||||
error = vn_rdwr(UIO_READ, invp, dat, xfer, | |||||
startoff, UIO_SYSSPACE, IO_NODELOCKED, | |||||
curthread->td_ucred, incred, &aresid, | |||||
curthread); | |||||
VOP_UNLOCK(invp, 0); | |||||
/* | |||||
* Linux considers a range that exceeds EOF to | |||||
Not Done Inline ActionsExcess () around cantseek. kib: Excess () around cantseek. | |||||
* be an error, so we will too. | |||||
*/ | |||||
if (error == 0 && aresid > 0) | |||||
error = EINVAL; | |||||
} | |||||
if (error == 0) { | |||||
/* | |||||
* Skip the write for holes past the initial EOF | |||||
* of the output file, unless this is the last | |||||
* write of the output file at EOF. | |||||
*/ | |||||
if (!cantseek || *outoffp < va.va_size || | |||||
xfer == len || !(readzeros || | |||||
mem_iszero(dat, xfer))) | |||||
error = vn_write_outvp(outvp, dat, | |||||
*outoffp, xfer, blksize, outcred); | |||||
if (error == 0) { | |||||
*inoffp += xfer; | |||||
startoff += xfer; | |||||
*outoffp += xfer; | |||||
copylen -= xfer; | |||||
len -= xfer; | |||||
} | |||||
} | |||||
xfer = blksize; | |||||
} | |||||
} | |||||
out: | |||||
*lenp = savlen - len; | |||||
free(dat, M_TEMP); | |||||
return (error); | |||||
} | } |
s/addr/dat/