Changeset View
Changeset View
Standalone View
Standalone View
sys/fs/fuse/fuse_io.c
Show All 27 Lines | |||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
* | * | ||||
* Copyright (C) 2005 Csaba Henk. | * Copyright (C) 2005 Csaba Henk. | ||||
* All rights reserved. | * All rights reserved. | ||||
* | * | ||||
* Copyright (c) 2019 The FreeBSD Foundation | |||||
* | |||||
* Portions of this software were developed by BFF Storage Systems, LLC under | |||||
* sponsorship from the FreeBSD Foundation. | |||||
* | |||||
* Redistribution and use in source and binary forms, with or without | * Redistribution and use in source and binary forms, with or without | ||||
* modification, are permitted provided that the following conditions | * modification, are permitted provided that the following conditions | ||||
* are met: | * are met: | ||||
* 1. Redistributions of source code must retain the above copyright | * 1. Redistributions of source code must retain the above copyright | ||||
* notice, this list of conditions and the following disclaimer. | * notice, this list of conditions and the following disclaimer. | ||||
* 2. Redistributions in binary form must reproduce the above copyright | * 2. Redistributions in binary form must reproduce the above copyright | ||||
* notice, this list of conditions and the following disclaimer in the | * notice, this list of conditions and the following disclaimer in the | ||||
* documentation and/or other materials provided with the distribution. | * documentation and/or other materials provided with the distribution. | ||||
Show All 23 Lines | |||||
#include <sys/conf.h> | #include <sys/conf.h> | ||||
#include <sys/uio.h> | #include <sys/uio.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/queue.h> | #include <sys/queue.h> | ||||
#include <sys/lock.h> | #include <sys/lock.h> | ||||
#include <sys/sx.h> | #include <sys/sx.h> | ||||
#include <sys/mutex.h> | #include <sys/mutex.h> | ||||
#include <sys/rwlock.h> | #include <sys/rwlock.h> | ||||
#include <sys/priv.h> | |||||
#include <sys/proc.h> | #include <sys/proc.h> | ||||
#include <sys/mount.h> | #include <sys/mount.h> | ||||
#include <sys/vnode.h> | #include <sys/vnode.h> | ||||
#include <sys/stat.h> | #include <sys/stat.h> | ||||
#include <sys/unistd.h> | #include <sys/unistd.h> | ||||
#include <sys/filedesc.h> | #include <sys/filedesc.h> | ||||
#include <sys/file.h> | #include <sys/file.h> | ||||
#include <sys/fcntl.h> | #include <sys/fcntl.h> | ||||
#include <sys/bio.h> | #include <sys/bio.h> | ||||
#include <sys/buf.h> | #include <sys/buf.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#include <sys/vmmeter.h> | |||||
#include <vm/vm.h> | #include <vm/vm.h> | ||||
#include <vm/vm_extern.h> | #include <vm/vm_extern.h> | ||||
#include <vm/pmap.h> | #include <vm/pmap.h> | ||||
#include <vm/vm_map.h> | #include <vm/vm_map.h> | ||||
#include <vm/vm_page.h> | #include <vm/vm_page.h> | ||||
#include <vm/vm_object.h> | #include <vm/vm_object.h> | ||||
#include "fuse.h" | #include "fuse.h" | ||||
#include "fuse_file.h" | #include "fuse_file.h" | ||||
#include "fuse_node.h" | #include "fuse_node.h" | ||||
#include "fuse_internal.h" | #include "fuse_internal.h" | ||||
#include "fuse_ipc.h" | #include "fuse_ipc.h" | ||||
#include "fuse_io.h" | #include "fuse_io.h" | ||||
SDT_PROVIDER_DECLARE(fuse); | |||||
/* | /* | ||||
* Set in a struct buf to indicate that the write came from the buffer cache | |||||
* and the originating cred and pid are no longer known. | |||||
*/ | |||||
#define B_FUSEFS_WRITE_CACHE B_FS_FLAG1 | |||||
SDT_PROVIDER_DECLARE(fusefs); | |||||
/* | |||||
* Fuse trace probe: | * Fuse trace probe: | ||||
* arg0: verbosity. Higher numbers give more verbose messages | * arg0: verbosity. Higher numbers give more verbose messages | ||||
* arg1: Textual message | * arg1: Textual message | ||||
*/ | */ | ||||
SDT_PROBE_DEFINE2(fuse, , io, trace, "int", "char*"); | SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*"); | ||||
static int | static int | ||||
fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end); | |||||
static void | |||||
fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred, | |||||
struct thread *td); | |||||
static int | |||||
fuse_read_directbackend(struct vnode *vp, struct uio *uio, | fuse_read_directbackend(struct vnode *vp, struct uio *uio, | ||||
struct ucred *cred, struct fuse_filehandle *fufh); | struct ucred *cred, struct fuse_filehandle *fufh); | ||||
static int | static int | ||||
fuse_read_biobackend(struct vnode *vp, struct uio *uio, | fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, | ||||
struct ucred *cred, struct fuse_filehandle *fufh); | struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid); | ||||
static int | static int | ||||
fuse_write_directbackend(struct vnode *vp, struct uio *uio, | fuse_write_directbackend(struct vnode *vp, struct uio *uio, | ||||
struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); | struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, | ||||
int ioflag, bool pages); | |||||
static int | static int | ||||
fuse_write_biobackend(struct vnode *vp, struct uio *uio, | fuse_write_biobackend(struct vnode *vp, struct uio *uio, | ||||
struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); | struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid); | ||||
SDT_PROBE_DEFINE5(fuse, , io, io_dispatch, "struct vnode*", "struct uio*", | /* Invalidate a range of cached data, whether dirty of not */ | ||||
static int | |||||
fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end) | |||||
{ | |||||
struct buf *bp; | |||||
daddr_t left_lbn, end_lbn, right_lbn; | |||||
off_t new_filesize; | |||||
int iosize, left_on, right_on, right_blksize; | |||||
iosize = fuse_iosize(vp); | |||||
left_lbn = start / iosize; | |||||
end_lbn = howmany(end, iosize); | |||||
left_on = start & (iosize - 1); | |||||
if (left_on != 0) { | |||||
bp = getblk(vp, left_lbn, iosize, PCATCH, 0, 0); | |||||
if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyend >= left_on) { | |||||
/* | |||||
* Flush the dirty buffer, because we don't have a | |||||
* byte-granular way to record which parts of the | |||||
* buffer are valid. | |||||
*/ | |||||
bwrite(bp); | |||||
if (bp->b_error) | |||||
return (bp->b_error); | |||||
} else { | |||||
brelse(bp); | |||||
} | |||||
} | |||||
right_on = end & (iosize - 1); | |||||
if (right_on != 0) { | |||||
right_lbn = end / iosize; | |||||
new_filesize = MAX(filesize, end); | |||||
right_blksize = MIN(iosize, new_filesize - iosize * right_lbn); | |||||
bp = getblk(vp, right_lbn, right_blksize, PCATCH, 0, 0); | |||||
if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyoff < right_on) { | |||||
/* | |||||
* Flush the dirty buffer, because we don't have a | |||||
* byte-granular way to record which parts of the | |||||
* buffer are valid. | |||||
*/ | |||||
bwrite(bp); | |||||
if (bp->b_error) | |||||
return (bp->b_error); | |||||
} else { | |||||
brelse(bp); | |||||
} | |||||
} | |||||
v_inval_buf_range(vp, left_lbn, end_lbn, iosize); | |||||
return (0); | |||||
} | |||||
/* | |||||
* FreeBSD clears the SUID and SGID bits on any write by a non-root user. | |||||
*/ | |||||
static void | |||||
fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred, | |||||
struct thread *td) | |||||
{ | |||||
struct fuse_data *data; | |||||
struct mount *mp; | |||||
struct vattr va; | |||||
int dataflags; | |||||
mp = vnode_mount(vp); | |||||
data = fuse_get_mpdata(mp); | |||||
dataflags = data->dataflags; | |||||
if (dataflags & FSESS_DEFAULT_PERMISSIONS) { | |||||
if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { | |||||
fuse_internal_getattr(vp, &va, cred, td); | |||||
if (va.va_mode & (S_ISUID | S_ISGID)) { | |||||
mode_t mode = va.va_mode & ~(S_ISUID | S_ISGID); | |||||
/* Clear all vattr fields except mode */ | |||||
vattr_null(&va); | |||||
va.va_mode = mode; | |||||
/* | |||||
* Ignore fuse_internal_setattr's return value, | |||||
* because at this point the write operation has | |||||
* already succeeded and we don't want to return | |||||
* failing status for that. | |||||
*/ | |||||
(void)fuse_internal_setattr(vp, &va, td, NULL); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch, "struct vnode*", "struct uio*", | |||||
"int", "struct ucred*", "struct fuse_filehandle*"); | "int", "struct ucred*", "struct fuse_filehandle*"); | ||||
SDT_PROBE_DEFINE4(fusefs, , io, io_dispatch_filehandles_closed, "struct vnode*", | |||||
"struct uio*", "int", "struct ucred*"); | |||||
int | int | ||||
fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, | fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, | ||||
struct ucred *cred) | struct ucred *cred, pid_t pid) | ||||
{ | { | ||||
struct fuse_filehandle *fufh; | struct fuse_filehandle *fufh; | ||||
int err, directio; | int err, directio; | ||||
int fflag; | |||||
bool closefufh = false; | |||||
MPASS(vp->v_type == VREG || vp->v_type == VDIR); | MPASS(vp->v_type == VREG || vp->v_type == VDIR); | ||||
err = fuse_filehandle_getrw(vp, | fflag = (uio->uio_rw == UIO_READ) ? FREAD : FWRITE; | ||||
(uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); | err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); | ||||
if (err) { | if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { | ||||
/* | |||||
* nfsd will do I/O without first doing VOP_OPEN. We | |||||
* must implicitly open the file here | |||||
*/ | |||||
err = fuse_filehandle_open(vp, fflag, &fufh, curthread, cred); | |||||
closefufh = true; | |||||
} | |||||
else if (err) { | |||||
SDT_PROBE4(fusefs, , io, io_dispatch_filehandles_closed, | |||||
vp, uio, ioflag, cred); | |||||
printf("FUSE: io dispatch: filehandles are closed\n"); | printf("FUSE: io dispatch: filehandles are closed\n"); | ||||
return err; | return err; | ||||
} | } | ||||
SDT_PROBE5(fuse, , io, io_dispatch, vp, uio, ioflag, cred, fufh); | if (err) | ||||
goto out; | |||||
SDT_PROBE5(fusefs, , io, io_dispatch, vp, uio, ioflag, cred, fufh); | |||||
/* | /* | ||||
* Ideally, when the daemon asks for direct io at open time, the | * Ideally, when the daemon asks for direct io at open time, the | ||||
* standard file flag should be set according to this, so that would | * standard file flag should be set according to this, so that would | ||||
* just change the default mode, which later on could be changed via | * just change the default mode, which later on could be changed via | ||||
* fcntl(2). | * fcntl(2). | ||||
* But this doesn't work, the O_DIRECT flag gets cleared at some point | * But this doesn't work, the O_DIRECT flag gets cleared at some point | ||||
* (don't know where). So to make any use of the Fuse direct_io option, | * (don't know where). So to make any use of the Fuse direct_io option, | ||||
* we hardwire it into the file's private data (similarly to Linux, | * we hardwire it into the file's private data (similarly to Linux, | ||||
* btw.). | * btw.). | ||||
*/ | */ | ||||
directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); | directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); | ||||
switch (uio->uio_rw) { | switch (uio->uio_rw) { | ||||
case UIO_READ: | case UIO_READ: | ||||
if (directio) { | if (directio) { | ||||
SDT_PROBE2(fuse, , io, trace, 1, | SDT_PROBE2(fusefs, , io, trace, 1, | ||||
"direct read of vnode"); | "direct read of vnode"); | ||||
err = fuse_read_directbackend(vp, uio, cred, fufh); | err = fuse_read_directbackend(vp, uio, cred, fufh); | ||||
} else { | } else { | ||||
SDT_PROBE2(fuse, , io, trace, 1, | SDT_PROBE2(fusefs, , io, trace, 1, | ||||
"buffered read of vnode"); | "buffered read of vnode"); | ||||
err = fuse_read_biobackend(vp, uio, cred, fufh); | err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh, | ||||
pid); | |||||
} | } | ||||
break; | break; | ||||
case UIO_WRITE: | case UIO_WRITE: | ||||
/* | fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); | ||||
* Kludge: simulate write-through caching via write-around | if (directio) { | ||||
* caching. Same effect, as far as never caching dirty data, | off_t start, end, filesize; | ||||
* but slightly pessimal in that newly written data is not | |||||
* cached. | SDT_PROBE2(fusefs, , io, trace, 1, | ||||
*/ | |||||
if (directio || fuse_data_cache_mode == FUSE_CACHE_WT) { | |||||
SDT_PROBE2(fuse, , io, trace, 1, | |||||
"direct write of vnode"); | "direct write of vnode"); | ||||
err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag); | |||||
err = fuse_vnode_size(vp, &filesize, cred, curthread); | |||||
if (err) | |||||
goto out; | |||||
start = uio->uio_offset; | |||||
end = start + uio->uio_resid; | |||||
KASSERT((ioflag & (IO_VMIO | IO_DIRECT)) != | |||||
(IO_VMIO | IO_DIRECT), | |||||
("IO_DIRECT used for a cache flush?")); | |||||
/* Invalidate the write cache when writing directly */ | |||||
err = fuse_inval_buf_range(vp, filesize, start, end); | |||||
if (err) | |||||
return (err); | |||||
err = fuse_write_directbackend(vp, uio, cred, fufh, | |||||
filesize, ioflag, false); | |||||
} else { | } else { | ||||
SDT_PROBE2(fuse, , io, trace, 1, | SDT_PROBE2(fusefs, , io, trace, 1, | ||||
"buffered write of vnode"); | "buffered write of vnode"); | ||||
err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag); | if (!fsess_opt_writeback(vnode_mount(vp))) | ||||
ioflag |= IO_SYNC; | |||||
err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, | |||||
pid); | |||||
} | } | ||||
fuse_io_clear_suid_on_write(vp, cred, uio->uio_td); | |||||
break; | break; | ||||
default: | default: | ||||
panic("uninterpreted mode passed to fuse_io_dispatch"); | panic("uninterpreted mode passed to fuse_io_dispatch"); | ||||
} | } | ||||
out: | |||||
if (closefufh) | |||||
fuse_filehandle_close(vp, fufh, curthread, cred); | |||||
return (err); | return (err); | ||||
} | } | ||||
SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_start, "int", "int", "int"); | SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int"); | ||||
SDT_PROBE_DEFINE2(fuse, , io, read_bio_backend_feed, "int", "int"); | SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "struct buf*"); | ||||
SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_end, "int", "ssize_t", "int"); | SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int", | ||||
"struct buf*"); | |||||
static int | static int | ||||
fuse_read_biobackend(struct vnode *vp, struct uio *uio, | fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, | ||||
struct ucred *cred, struct fuse_filehandle *fufh) | struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid) | ||||
{ | { | ||||
struct buf *bp; | struct buf *bp; | ||||
daddr_t lbn; | struct mount *mp; | ||||
int bcount; | struct fuse_data *data; | ||||
int err = 0, n = 0, on = 0; | daddr_t lbn, nextlbn; | ||||
int bcount, nextsize; | |||||
int err, n = 0, on = 0, seqcount; | |||||
off_t filesize; | off_t filesize; | ||||
const int biosize = fuse_iosize(vp); | const int biosize = fuse_iosize(vp); | ||||
mp = vnode_mount(vp); | |||||
data = fuse_get_mpdata(mp); | |||||
if (uio->uio_resid == 0) | |||||
return (0); | |||||
if (uio->uio_offset < 0) | if (uio->uio_offset < 0) | ||||
return (EINVAL); | return (EINVAL); | ||||
bcount = biosize; | seqcount = ioflag >> IO_SEQSHIFT; | ||||
filesize = VTOFUD(vp)->filesize; | |||||
do { | err = fuse_vnode_size(vp, &filesize, cred, curthread); | ||||
if (err) | |||||
return err; | |||||
for (err = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { | |||||
if (fuse_isdeadfs(vp)) { | if (fuse_isdeadfs(vp)) { | ||||
err = ENXIO; | err = ENXIO; | ||||
break; | break; | ||||
} | } | ||||
if (filesize - uio->uio_offset <= 0) | |||||
break; | |||||
lbn = uio->uio_offset / biosize; | lbn = uio->uio_offset / biosize; | ||||
on = uio->uio_offset & (biosize - 1); | on = uio->uio_offset & (biosize - 1); | ||||
SDT_PROBE3(fuse, , io, read_bio_backend_start, | |||||
biosize, (int)lbn, on); | |||||
/* | |||||
* Obtain the buffer cache block. Figure out the buffer size | |||||
* when we are at EOF. If we are modifying the size of the | |||||
* buffer based on an EOF condition we need to hold | |||||
* nfs_rslock() through obtaining the buffer to prevent | |||||
* a potential writer-appender from messing with n_size. | |||||
* Otherwise we may accidentally truncate the buffer and | |||||
* lose dirty data. | |||||
* | |||||
* Note that bcount is *not* DEV_BSIZE aligned. | |||||
*/ | |||||
if ((off_t)lbn * biosize >= filesize) { | if ((off_t)lbn * biosize >= filesize) { | ||||
bcount = 0; | bcount = 0; | ||||
} else if ((off_t)(lbn + 1) * biosize > filesize) { | } else if ((off_t)(lbn + 1) * biosize > filesize) { | ||||
bcount = filesize - (off_t)lbn *biosize; | bcount = filesize - (off_t)lbn *biosize; | ||||
} else { | |||||
bcount = biosize; | |||||
} | } | ||||
bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); | nextlbn = lbn + 1; | ||||
nextsize = MIN(biosize, filesize - nextlbn * biosize); | |||||
if (!bp) | SDT_PROBE4(fusefs, , io, read_bio_backend_start, | ||||
return (EINTR); | biosize, (int)lbn, on, bcount); | ||||
/* | if (bcount < biosize) { | ||||
* If B_CACHE is not set, we must issue the read. If this | /* If near EOF, don't do readahead */ | ||||
* fails, we return an error. | err = bread(vp, lbn, bcount, NOCRED, &bp); | ||||
*/ | } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { | ||||
/* Try clustered read */ | |||||
long totread = uio->uio_resid + on; | |||||
seqcount = MIN(seqcount, | |||||
data->max_readahead_blocks + 1); | |||||
err = cluster_read(vp, filesize, lbn, bcount, NOCRED, | |||||
totread, seqcount, 0, &bp); | |||||
} else if (seqcount > 1 && data->max_readahead_blocks >= 1) { | |||||
/* Try non-clustered readahead */ | |||||
err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1, | |||||
NOCRED, &bp); | |||||
} else { | |||||
/* Just read what was requested */ | |||||
err = bread(vp, lbn, bcount, NOCRED, &bp); | |||||
} | |||||
if ((bp->b_flags & B_CACHE) == 0) { | |||||
bp->b_iocmd = BIO_READ; | |||||
vfs_busy_pages(bp, 0); | |||||
err = fuse_io_strategy(vp, bp); | |||||
if (err) { | if (err) { | ||||
brelse(bp); | brelse(bp); | ||||
return (err); | bp = NULL; | ||||
break; | |||||
} | } | ||||
} | |||||
/* | /* | ||||
* on is the offset into the current bp. Figure out how many | * on is the offset into the current bp. Figure out how many | ||||
* bytes we can copy out of the bp. Note that bcount is | * bytes we can copy out of the bp. Note that bcount is | ||||
* NOT DEV_BSIZE aligned. | * NOT DEV_BSIZE aligned. | ||||
* | * | ||||
* Then figure out how many bytes we can copy into the uio. | * Then figure out how many bytes we can copy into the uio. | ||||
*/ | */ | ||||
n = 0; | n = 0; | ||||
if (on < bcount) | if (on < bcount - bp->b_resid) | ||||
n = MIN((unsigned)(bcount - on), uio->uio_resid); | n = MIN((unsigned)(bcount - bp->b_resid - on), | ||||
uio->uio_resid); | |||||
if (n > 0) { | if (n > 0) { | ||||
SDT_PROBE2(fuse, , io, read_bio_backend_feed, | SDT_PROBE2(fusefs, , io, read_bio_backend_feed, n, bp); | ||||
n, n + (int)bp->b_resid); | |||||
err = uiomove(bp->b_data + on, n, uio); | err = uiomove(bp->b_data + on, n, uio); | ||||
} | } | ||||
brelse(bp); | vfs_bio_brelse(bp, ioflag); | ||||
SDT_PROBE3(fuse, , io, read_bio_backend_end, err, | SDT_PROBE4(fusefs, , io, read_bio_backend_end, err, | ||||
uio->uio_resid, n); | uio->uio_resid, n, bp); | ||||
} while (err == 0 && uio->uio_resid > 0 && n > 0); | if (bp->b_resid > 0) { | ||||
/* Short read indicates EOF */ | |||||
break; | |||||
} | |||||
} | |||||
return (err); | return (err); | ||||
} | } | ||||
SDT_PROBE_DEFINE1(fuse, , io, read_directbackend_start, "struct fuse_read_in*"); | SDT_PROBE_DEFINE1(fusefs, , io, read_directbackend_start, | ||||
SDT_PROBE_DEFINE2(fuse, , io, read_directbackend_complete, | "struct fuse_read_in*"); | ||||
"struct fuse_dispatcher*", "struct uio*"); | SDT_PROBE_DEFINE3(fusefs, , io, read_directbackend_complete, | ||||
"struct fuse_dispatcher*", "struct fuse_read_in*", "struct uio*"); | |||||
static int | static int | ||||
fuse_read_directbackend(struct vnode *vp, struct uio *uio, | fuse_read_directbackend(struct vnode *vp, struct uio *uio, | ||||
struct ucred *cred, struct fuse_filehandle *fufh) | struct ucred *cred, struct fuse_filehandle *fufh) | ||||
{ | { | ||||
struct fuse_data *data; | |||||
struct fuse_dispatcher fdi; | struct fuse_dispatcher fdi; | ||||
struct fuse_read_in *fri; | struct fuse_read_in *fri; | ||||
int err = 0; | int err = 0; | ||||
data = fuse_get_mpdata(vp->v_mount); | |||||
if (uio->uio_resid == 0) | if (uio->uio_resid == 0) | ||||
return (0); | return (0); | ||||
fdisp_init(&fdi, 0); | fdisp_init(&fdi, 0); | ||||
/* | /* | ||||
* XXX In "normal" case we use an intermediate kernel buffer for | * XXX In "normal" case we use an intermediate kernel buffer for | ||||
* transmitting data from daemon's context to ours. Eventually, we should | * transmitting data from daemon's context to ours. Eventually, we should | ||||
* get rid of this. Anyway, if the target uio lives in sysspace (we are | * get rid of this. Anyway, if the target uio lives in sysspace (we are | ||||
* called from pageops), and the input data doesn't need kernel-side | * called from pageops), and the input data doesn't need kernel-side | ||||
* processing (we are not called from readdir) we can already invoke | * processing (we are not called from readdir) we can already invoke | ||||
* an optimized, "peer-to-peer" I/O routine. | * an optimized, "peer-to-peer" I/O routine. | ||||
*/ | */ | ||||
while (uio->uio_resid > 0) { | while (uio->uio_resid > 0) { | ||||
fdi.iosize = sizeof(*fri); | fdi.iosize = sizeof(*fri); | ||||
fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); | fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); | ||||
fri = fdi.indata; | fri = fdi.indata; | ||||
fri->fh = fufh->fh_id; | fri->fh = fufh->fh_id; | ||||
fri->offset = uio->uio_offset; | fri->offset = uio->uio_offset; | ||||
fri->size = MIN(uio->uio_resid, | fri->size = MIN(uio->uio_resid, | ||||
fuse_get_mpdata(vp->v_mount)->max_read); | fuse_get_mpdata(vp->v_mount)->max_read); | ||||
if (fuse_libabi_geq(data, 7, 9)) { | |||||
/* See comment regarding FUSE_WRITE_LOCKOWNER */ | |||||
fri->read_flags = 0; | |||||
fri->flags = fufh_type_2_fflags(fufh->fufh_type); | |||||
} | |||||
SDT_PROBE1(fuse, , io, read_directbackend_start, fri); | SDT_PROBE1(fusefs, , io, read_directbackend_start, fri); | ||||
if ((err = fdisp_wait_answ(&fdi))) | if ((err = fdisp_wait_answ(&fdi))) | ||||
goto out; | goto out; | ||||
SDT_PROBE2(fuse, , io, read_directbackend_complete, | SDT_PROBE3(fusefs, , io, read_directbackend_complete, | ||||
fdi.iosize, uio); | &fdi, fri, uio); | ||||
if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) | if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) | ||||
break; | break; | ||||
if (fdi.iosize < fri->size) | if (fdi.iosize < fri->size) { | ||||
/* | |||||
* Short read. Should only happen at EOF or with | |||||
* direct io. | |||||
*/ | |||||
break; | break; | ||||
} | } | ||||
} | |||||
out: | out: | ||||
fdisp_destroy(&fdi); | fdisp_destroy(&fdi); | ||||
return (err); | return (err); | ||||
} | } | ||||
static int | static int | ||||
fuse_write_directbackend(struct vnode *vp, struct uio *uio, | fuse_write_directbackend(struct vnode *vp, struct uio *uio, | ||||
struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) | struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, | ||||
int ioflag, bool pages) | |||||
{ | { | ||||
struct fuse_vnode_data *fvdat = VTOFUD(vp); | struct fuse_vnode_data *fvdat = VTOFUD(vp); | ||||
struct fuse_data *data; | |||||
struct fuse_write_in *fwi; | struct fuse_write_in *fwi; | ||||
struct fuse_write_out *fwo; | |||||
struct fuse_dispatcher fdi; | struct fuse_dispatcher fdi; | ||||
size_t chunksize; | size_t chunksize; | ||||
void *fwi_data; | |||||
off_t as_written_offset; | |||||
int diff; | int diff; | ||||
int err = 0; | int err = 0; | ||||
bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO; | |||||
bool wrote_anything = false; | |||||
uint32_t write_flags; | |||||
data = fuse_get_mpdata(vp->v_mount); | |||||
/* | |||||
* Don't set FUSE_WRITE_LOCKOWNER in write_flags. It can't be set | |||||
* accurately when using POSIX AIO, libfuse doesn't use it, and I'm not | |||||
* aware of any file systems that do. It was an attempt to add | |||||
* Linux-style mandatory locking to the FUSE protocol, but mandatory | |||||
* locking is deprecated even on Linux. See Linux commit | |||||
* f33321141b273d60cbb3a8f56a5489baad82ba5e . | |||||
*/ | |||||
/* | |||||
* Set FUSE_WRITE_CACHE whenever we don't know the uid, gid, and/or pid | |||||
* that originated a write. For example when writing from the | |||||
* writeback cache. I don't know of a single file system that cares, | |||||
* but the protocol says we're supposed to do this. | |||||
*/ | |||||
write_flags = !pages && ( | |||||
(ioflag & IO_DIRECT) || | |||||
!fsess_opt_datacache(vnode_mount(vp)) || | |||||
!fsess_opt_writeback(vnode_mount(vp))) ? 0 : FUSE_WRITE_CACHE; | |||||
if (uio->uio_resid == 0) | if (uio->uio_resid == 0) | ||||
return (0); | return (0); | ||||
if (ioflag & IO_APPEND) | if (ioflag & IO_APPEND) | ||||
uio_setoffset(uio, fvdat->filesize); | uio_setoffset(uio, filesize); | ||||
if (vn_rlimit_fsize(vp, uio, uio->uio_td)) | |||||
return (EFBIG); | |||||
fdisp_init(&fdi, 0); | fdisp_init(&fdi, 0); | ||||
while (uio->uio_resid > 0) { | while (uio->uio_resid > 0) { | ||||
chunksize = MIN(uio->uio_resid, | chunksize = MIN(uio->uio_resid, data->max_write); | ||||
fuse_get_mpdata(vp->v_mount)->max_write); | |||||
fdi.iosize = sizeof(*fwi) + chunksize; | fdi.iosize = sizeof(*fwi) + chunksize; | ||||
fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); | fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); | ||||
fwi = fdi.indata; | fwi = fdi.indata; | ||||
fwi->fh = fufh->fh_id; | fwi->fh = fufh->fh_id; | ||||
fwi->offset = uio->uio_offset; | fwi->offset = uio->uio_offset; | ||||
fwi->size = chunksize; | fwi->size = chunksize; | ||||
fwi->write_flags = write_flags; | |||||
if (fuse_libabi_geq(data, 7, 9)) { | |||||
fwi->flags = fufh_type_2_fflags(fufh->fufh_type); | |||||
fwi_data = (char *)fdi.indata + sizeof(*fwi); | |||||
} else { | |||||
fwi_data = (char *)fdi.indata + | |||||
FUSE_COMPAT_WRITE_IN_SIZE; | |||||
} | |||||
if ((err = uiomove((char *)fdi.indata + sizeof(*fwi), | if ((err = uiomove(fwi_data, chunksize, uio))) | ||||
chunksize, uio))) | |||||
break; | break; | ||||
if ((err = fdisp_wait_answ(&fdi))) | retry: | ||||
err = fdisp_wait_answ(&fdi); | |||||
if (err == ERESTART || err == EINTR || err == EWOULDBLOCK) { | |||||
/* | |||||
* Rewind the uio so dofilewrite will know it's | |||||
* incomplete | |||||
*/ | |||||
uio->uio_resid += fwi->size; | |||||
uio->uio_offset -= fwi->size; | |||||
/* | |||||
* Change ERESTART into EINTR because we can't rewind | |||||
* uio->uio_iov. Basically, once uiomove(9) has been | |||||
* called, it's impossible to restart a syscall. | |||||
*/ | |||||
if (err == ERESTART) | |||||
err = EINTR; | |||||
break; | break; | ||||
} else if (err) { | |||||
break; | |||||
} else { | |||||
wrote_anything = true; | |||||
} | |||||
fwo = ((struct fuse_write_out *)fdi.answ); | |||||
/* Adjust the uio in the case of short writes */ | /* Adjust the uio in the case of short writes */ | ||||
diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size; | diff = fwi->size - fwo->size; | ||||
as_written_offset = uio->uio_offset - diff; | |||||
if (as_written_offset - diff > filesize) | |||||
fuse_vnode_setsize(vp, as_written_offset); | |||||
if (as_written_offset - diff >= filesize) | |||||
fvdat->flag &= ~FN_SIZECHANGE; | |||||
if (diff < 0) { | if (diff < 0) { | ||||
printf("WARNING: misbehaving FUSE filesystem " | |||||
"wrote more data than we provided it\n"); | |||||
err = EINVAL; | err = EINVAL; | ||||
break; | break; | ||||
} else if (diff > 0 && !(ioflag & IO_DIRECT)) { | } else if (diff > 0) { | ||||
/* | /* Short write */ | ||||
* XXX We really should be directly checking whether | if (!direct_io) { | ||||
* the file was opened with FOPEN_DIRECT_IO, not | printf("WARNING: misbehaving FUSE filesystem: " | ||||
* IO_DIRECT. IO_DIRECT can be set in multiple ways. | "short writes are only allowed with " | ||||
*/ | "direct_io\n"); | ||||
SDT_PROBE2(fuse, , io, trace, 1, | |||||
"misbehaving filesystem: short writes are only " | |||||
"allowed with direct_io"); | |||||
} | } | ||||
if (ioflag & IO_DIRECT) { | |||||
/* Return early */ | |||||
uio->uio_resid += diff; | uio->uio_resid += diff; | ||||
uio->uio_offset -= diff; | uio->uio_offset -= diff; | ||||
break; | |||||
if (uio->uio_offset > fvdat->filesize && | } else { | ||||
fuse_data_cache_mode != FUSE_CACHE_UC) { | /* Resend the unwritten portion of data */ | ||||
fuse_vnode_setsize(vp, uio->uio_offset); | fdi.iosize = sizeof(*fwi) + diff; | ||||
fvdat->flag &= ~FN_SIZECHANGE; | /* Refresh fdi without clearing data buffer */ | ||||
fdisp_refresh_vp(&fdi, FUSE_WRITE, vp, | |||||
uio->uio_td, cred); | |||||
fwi = fdi.indata; | |||||
MPASS2(fwi == fdi.indata, "FUSE dispatcher " | |||||
"reallocated despite no increase in " | |||||
"size?"); | |||||
void *src = (char*)fwi_data + fwo->size; | |||||
memmove(fwi_data, src, diff); | |||||
fwi->fh = fufh->fh_id; | |||||
fwi->offset = as_written_offset; | |||||
fwi->size = diff; | |||||
fwi->write_flags = write_flags; | |||||
goto retry; | |||||
} | } | ||||
} | } | ||||
} | |||||
fdisp_destroy(&fdi); | fdisp_destroy(&fdi); | ||||
if (wrote_anything) | |||||
fuse_vnode_undirty_cached_timestamps(vp); | |||||
return (err); | return (err); | ||||
} | } | ||||
SDT_PROBE_DEFINE6(fuse, , io, write_biobackend_start, "int64_t", "int", "int", | SDT_PROBE_DEFINE6(fusefs, , io, write_biobackend_start, "int64_t", "int", "int", | ||||
"struct uio*", "int", "bool"); | "struct uio*", "int", "bool"); | ||||
SDT_PROBE_DEFINE2(fuse, , io, write_biobackend_append_race, "long", "int"); | SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_append_race, "long", "int"); | ||||
SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_issue, "int", "struct buf*"); | |||||
static int | static int | ||||
fuse_write_biobackend(struct vnode *vp, struct uio *uio, | fuse_write_biobackend(struct vnode *vp, struct uio *uio, | ||||
struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) | struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid) | ||||
{ | { | ||||
struct fuse_vnode_data *fvdat = VTOFUD(vp); | struct fuse_vnode_data *fvdat = VTOFUD(vp); | ||||
struct buf *bp; | struct buf *bp; | ||||
daddr_t lbn; | daddr_t lbn; | ||||
off_t filesize; | |||||
int bcount; | int bcount; | ||||
int n, on, err = 0; | int n, on, seqcount, err = 0; | ||||
bool last_page; | |||||
const int biosize = fuse_iosize(vp); | const int biosize = fuse_iosize(vp); | ||||
KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); | seqcount = ioflag >> IO_SEQSHIFT; | ||||
KASSERT(uio->uio_rw == UIO_WRITE, ("fuse_write_biobackend mode")); | |||||
if (vp->v_type != VREG) | if (vp->v_type != VREG) | ||||
return (EIO); | return (EIO); | ||||
if (uio->uio_offset < 0) | if (uio->uio_offset < 0) | ||||
return (EINVAL); | return (EINVAL); | ||||
if (uio->uio_resid == 0) | if (uio->uio_resid == 0) | ||||
return (0); | return (0); | ||||
err = fuse_vnode_size(vp, &filesize, cred, curthread); | |||||
if (err) | |||||
return err; | |||||
if (ioflag & IO_APPEND) | if (ioflag & IO_APPEND) | ||||
uio_setoffset(uio, fvdat->filesize); | uio_setoffset(uio, filesize); | ||||
/* | if (vn_rlimit_fsize(vp, uio, uio->uio_td)) | ||||
* Find all of this file's B_NEEDCOMMIT buffers. If our writes | return (EFBIG); | ||||
* would exceed the local maximum per-file write commit size when | |||||
* combined with those, we must decide whether to flush, | |||||
* go synchronous, or return err. We don't bother checking | |||||
* IO_UNIT -- we just make all writes atomic anyway, as there's | |||||
* no point optimizing for something that really won't ever happen. | |||||
*/ | |||||
do { | do { | ||||
bool direct_append, extending; | |||||
if (fuse_isdeadfs(vp)) { | if (fuse_isdeadfs(vp)) { | ||||
err = ENXIO; | err = ENXIO; | ||||
break; | break; | ||||
} | } | ||||
lbn = uio->uio_offset / biosize; | lbn = uio->uio_offset / biosize; | ||||
on = uio->uio_offset & (biosize - 1); | on = uio->uio_offset & (biosize - 1); | ||||
n = MIN((unsigned)(biosize - on), uio->uio_resid); | n = MIN((unsigned)(biosize - on), uio->uio_resid); | ||||
again: | again: | ||||
/* Get or create a buffer for the write */ | |||||
direct_append = uio->uio_offset == filesize && n; | |||||
if (uio->uio_offset + n < filesize) { | |||||
extending = false; | |||||
if ((off_t)(lbn + 1) * biosize < filesize) { | |||||
/* Not the file's last block */ | |||||
bcount = biosize; | |||||
} else { | |||||
/* The file's last block */ | |||||
bcount = filesize - (off_t)lbn * biosize; | |||||
} | |||||
} else { | |||||
extending = true; | |||||
bcount = on + n; | |||||
} | |||||
if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >= | |||||
howmany(filesize, PAGE_SIZE)) | |||||
last_page = true; | |||||
else | |||||
last_page = false; | |||||
if (direct_append) { | |||||
/* | /* | ||||
* Handle direct append and file extension cases, calculate | * Take care to preserve the buffer's B_CACHE state so | ||||
* unaligned buffer size. | * as not to cause an unnecessary read. | ||||
*/ | */ | ||||
if (uio->uio_offset == fvdat->filesize && n) { | bp = getblk(vp, lbn, on, PCATCH, 0, 0); | ||||
/* | |||||
* Get the buffer (in its pre-append state to maintain | |||||
* B_CACHE if it was previously set). Resize the | |||||
* nfsnode after we have locked the buffer to prevent | |||||
* readers from reading garbage. | |||||
*/ | |||||
bcount = on; | |||||
SDT_PROBE6(fuse, , io, write_biobackend_start, | |||||
lbn, on, n, uio, bcount, true); | |||||
bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); | |||||
if (bp != NULL) { | if (bp != NULL) { | ||||
long save; | uint32_t save = bp->b_flags & B_CACHE; | ||||
err = fuse_vnode_setsize(vp, | |||||
uio->uio_offset + n); | |||||
if (err) { | |||||
brelse(bp); | |||||
break; | |||||
} | |||||
save = bp->b_flags & B_CACHE; | |||||
bcount += n; | |||||
allocbuf(bp, bcount); | allocbuf(bp, bcount); | ||||
bp->b_flags |= save; | bp->b_flags |= save; | ||||
} | } | ||||
} else { | } else { | ||||
bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); | |||||
} | |||||
if (!bp) { | |||||
err = EINTR; | |||||
break; | |||||
} | |||||
if (extending) { | |||||
/* | /* | ||||
* Obtain the locked cache block first, and then | * Extend file _after_ locking buffer so we won't race | ||||
* adjust the file's size as appropriate. | * with other readers | ||||
*/ | */ | ||||
bcount = on + n; | err = fuse_vnode_setsize(vp, uio->uio_offset + n); | ||||
if ((off_t)lbn * biosize + bcount < fvdat->filesize) { | filesize = uio->uio_offset + n; | ||||
if ((off_t)(lbn + 1) * biosize < fvdat->filesize) | fvdat->flag |= FN_SIZECHANGE; | ||||
bcount = biosize; | |||||
else | |||||
bcount = fvdat->filesize - | |||||
(off_t)lbn *biosize; | |||||
} | |||||
SDT_PROBE6(fuse, , io, write_biobackend_start, | |||||
lbn, on, n, uio, bcount, false); | |||||
bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); | |||||
if (bp && uio->uio_offset + n > fvdat->filesize) { | |||||
err = fuse_vnode_setsize(vp, | |||||
uio->uio_offset + n); | |||||
if (err) { | if (err) { | ||||
brelse(bp); | brelse(bp); | ||||
break; | break; | ||||
} | } | ||||
} | } | ||||
} | |||||
if (!bp) { | SDT_PROBE6(fusefs, , io, write_biobackend_start, | ||||
err = EINTR; | lbn, on, n, uio, bcount, direct_append); | ||||
break; | |||||
} | |||||
/* | /* | ||||
* Issue a READ if B_CACHE is not set. In special-append | * Issue a READ if B_CACHE is not set. In special-append | ||||
* mode, B_CACHE is based on the buffer prior to the write | * mode, B_CACHE is based on the buffer prior to the write | ||||
* op and is typically set, avoiding the read. If a read | * op and is typically set, avoiding the read. If a read | ||||
* is required in special append mode, the server will | * is required in special append mode, the server will | ||||
* probably send us a short-read since we extended the file | * probably send us a short-read since we extended the file | ||||
* on our end, resulting in b_resid == 0 and, thus, | * on our end, resulting in b_resid == 0 and, thus, | ||||
* B_CACHE getting set. | * B_CACHE getting set. | ||||
Show All 16 Lines | again: | ||||
if ((bp->b_flags & B_CACHE) == 0) { | if ((bp->b_flags & B_CACHE) == 0) { | ||||
bp->b_iocmd = BIO_READ; | bp->b_iocmd = BIO_READ; | ||||
vfs_busy_pages(bp, 0); | vfs_busy_pages(bp, 0); | ||||
fuse_io_strategy(vp, bp); | fuse_io_strategy(vp, bp); | ||||
if ((err = bp->b_error)) { | if ((err = bp->b_error)) { | ||||
brelse(bp); | brelse(bp); | ||||
break; | break; | ||||
} | } | ||||
if (bp->b_resid > 0) { | |||||
/* | |||||
* Short read indicates EOF. Update file size | |||||
* from the server and try again. | |||||
*/ | |||||
SDT_PROBE2(fusefs, , io, trace, 1, | |||||
"Short read during a RMW"); | |||||
brelse(bp); | |||||
err = fuse_vnode_size(vp, &filesize, cred, | |||||
curthread); | |||||
if (err) | |||||
break; | |||||
else | |||||
goto again; | |||||
} | } | ||||
} | |||||
if (bp->b_wcred == NOCRED) | if (bp->b_wcred == NOCRED) | ||||
bp->b_wcred = crhold(cred); | bp->b_wcred = crhold(cred); | ||||
/* | /* | ||||
* If dirtyend exceeds file size, chop it down. This should | * If dirtyend exceeds file size, chop it down. This should | ||||
* not normally occur but there is an append race where it | * not normally occur but there is an append race where it | ||||
* might occur XXX, so we log it. | * might occur XXX, so we log it. | ||||
* | * | ||||
* If the chopping creates a reverse-indexed or degenerate | * If the chopping creates a reverse-indexed or degenerate | ||||
* situation with dirtyoff/end, we 0 both of them. | * situation with dirtyoff/end, we 0 both of them. | ||||
*/ | */ | ||||
if (bp->b_dirtyend > bcount) { | if (bp->b_dirtyend > bcount) { | ||||
SDT_PROBE2(fuse, , io, write_biobackend_append_race, | SDT_PROBE2(fusefs, , io, write_biobackend_append_race, | ||||
(long)bp->b_blkno * biosize, | (long)bp->b_blkno * biosize, | ||||
bp->b_dirtyend - bcount); | bp->b_dirtyend - bcount); | ||||
bp->b_dirtyend = bcount; | bp->b_dirtyend = bcount; | ||||
} | } | ||||
if (bp->b_dirtyoff >= bp->b_dirtyend) | if (bp->b_dirtyoff >= bp->b_dirtyend) | ||||
bp->b_dirtyoff = bp->b_dirtyend = 0; | bp->b_dirtyoff = bp->b_dirtyend = 0; | ||||
/* | /* | ||||
Show All 16 Lines | again: | ||||
if (bp->b_dirtyend > 0 && | if (bp->b_dirtyend > 0 && | ||||
(on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { | (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { | ||||
/* | /* | ||||
* Yes, we mean it. Write out everything to "storage" | * Yes, we mean it. Write out everything to "storage" | ||||
* immediately, without hesitation. (Apart from other | * immediately, without hesitation. (Apart from other | ||||
* reasons: the only way to know if a write is valid | * reasons: the only way to know if a write is valid | ||||
* is if it's actually written out.) | * is if it's actually written out.) | ||||
*/ | */ | ||||
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 0, bp); | |||||
bwrite(bp); | bwrite(bp); | ||||
if (bp->b_error == EINTR) { | if (bp->b_error == EINTR) { | ||||
err = EINTR; | err = EINTR; | ||||
break; | break; | ||||
} | } | ||||
goto again; | goto again; | ||||
} | } | ||||
err = uiomove((char *)bp->b_data + on, n, uio); | err = uiomove((char *)bp->b_data + on, n, uio); | ||||
/* | |||||
* Since this block is being modified, it must be written | |||||
* again and not just committed. Since write clustering does | |||||
* not work for the stage 1 data write, only the stage 2 | |||||
* commit rpc, we have to clear B_CLUSTEROK as well. | |||||
*/ | |||||
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |||||
if (err) { | if (err) { | ||||
bp->b_ioflags |= BIO_ERROR; | bp->b_ioflags |= BIO_ERROR; | ||||
bp->b_error = err; | bp->b_error = err; | ||||
brelse(bp); | brelse(bp); | ||||
break; | break; | ||||
/* TODO: vfs_bio_clrbuf like ffs_write does? */ | |||||
} | } | ||||
/* | /* | ||||
* Only update dirtyoff/dirtyend if not a degenerate | * Only update dirtyoff/dirtyend if not a degenerate | ||||
* condition. | * condition. | ||||
*/ | */ | ||||
if (n) { | if (n) { | ||||
if (bp->b_dirtyend > 0) { | if (bp->b_dirtyend > 0) { | ||||
bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); | bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); | ||||
bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); | bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); | ||||
} else { | } else { | ||||
bp->b_dirtyoff = on; | bp->b_dirtyoff = on; | ||||
bp->b_dirtyend = on + n; | bp->b_dirtyend = on + n; | ||||
} | } | ||||
vfs_bio_set_valid(bp, on, n); | vfs_bio_set_valid(bp, on, n); | ||||
} | } | ||||
vfs_bio_set_flags(bp, ioflag); | |||||
bp->b_flags |= B_FUSEFS_WRITE_CACHE; | |||||
if (ioflag & IO_SYNC) { | |||||
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 2, bp); | |||||
if (!(ioflag & IO_VMIO)) | |||||
bp->b_flags &= ~B_FUSEFS_WRITE_CACHE; | |||||
err = bwrite(bp); | err = bwrite(bp); | ||||
} else if (vm_page_count_severe() || | |||||
buf_dirty_count_severe() || | |||||
(ioflag & IO_ASYNC)) { | |||||
bp->b_flags |= B_CLUSTEROK; | |||||
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 3, bp); | |||||
bawrite(bp); | |||||
} else if (on == 0 && n == bcount) { | |||||
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { | |||||
bp->b_flags |= B_CLUSTEROK; | |||||
SDT_PROBE2(fusefs, , io, write_biobackend_issue, | |||||
4, bp); | |||||
cluster_write(vp, bp, filesize, seqcount, 0); | |||||
} else { | |||||
SDT_PROBE2(fusefs, , io, write_biobackend_issue, | |||||
5, bp); | |||||
bawrite(bp); | |||||
} | |||||
} else if (ioflag & IO_DIRECT) { | |||||
bp->b_flags |= B_CLUSTEROK; | |||||
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 6, bp); | |||||
bawrite(bp); | |||||
} else { | |||||
bp->b_flags &= ~B_CLUSTEROK; | |||||
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 7, bp); | |||||
bdwrite(bp); | |||||
} | |||||
if (err) | if (err) | ||||
break; | break; | ||||
} while (uio->uio_resid > 0 && n > 0); | } while (uio->uio_resid > 0 && n > 0); | ||||
if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) | |||||
fuse_vnode_savesize(vp, cred); | |||||
return (err); | return (err); | ||||
} | } | ||||
int | int | ||||
fuse_io_strategy(struct vnode *vp, struct buf *bp) | fuse_io_strategy(struct vnode *vp, struct buf *bp) | ||||
{ | { | ||||
struct fuse_filehandle *fufh; | |||||
struct fuse_vnode_data *fvdat = VTOFUD(vp); | struct fuse_vnode_data *fvdat = VTOFUD(vp); | ||||
struct fuse_filehandle *fufh; | |||||
struct ucred *cred; | struct ucred *cred; | ||||
struct uio *uiop; | struct uio *uiop; | ||||
struct uio uio; | struct uio uio; | ||||
struct iovec io; | struct iovec io; | ||||
off_t filesize; | |||||
int error = 0; | int error = 0; | ||||
int fflag; | |||||
/* We don't know the true pid when we're dealing with the cache */ | |||||
pid_t pid = 0; | |||||
const int biosize = fuse_iosize(vp); | const int biosize = fuse_iosize(vp); | ||||
MPASS(vp->v_type == VREG || vp->v_type == VDIR); | MPASS(vp->v_type == VREG || vp->v_type == VDIR); | ||||
MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); | MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); | ||||
error = fuse_filehandle_getrw(vp, | fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE; | ||||
(bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); | cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; | ||||
error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); | |||||
if (bp->b_iocmd == BIO_READ && error == EBADF) { | |||||
/* | |||||
* This may be a read-modify-write operation on a cached file | |||||
* opened O_WRONLY. The FUSE protocol allows this. | |||||
*/ | |||||
error = fuse_filehandle_get(vp, FWRITE, &fufh, cred, pid); | |||||
} | |||||
if (error) { | if (error) { | ||||
printf("FUSE: strategy: filehandles are closed\n"); | printf("FUSE: strategy: filehandles are closed\n"); | ||||
bp->b_ioflags |= BIO_ERROR; | bp->b_ioflags |= BIO_ERROR; | ||||
bp->b_error = error; | bp->b_error = error; | ||||
bufdone(bp); | |||||
return (error); | return (error); | ||||
} | } | ||||
cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; | |||||
uiop = &uio; | uiop = &uio; | ||||
uiop->uio_iov = &io; | uiop->uio_iov = &io; | ||||
uiop->uio_iovcnt = 1; | uiop->uio_iovcnt = 1; | ||||
uiop->uio_segflg = UIO_SYSSPACE; | uiop->uio_segflg = UIO_SYSSPACE; | ||||
uiop->uio_td = curthread; | uiop->uio_td = curthread; | ||||
/* | /* | ||||
* clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We | * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We | ||||
* do this here so we do not have to do it in all the code that | * do this here so we do not have to do it in all the code that | ||||
* calls us. | * calls us. | ||||
*/ | */ | ||||
bp->b_flags &= ~B_INVAL; | bp->b_flags &= ~B_INVAL; | ||||
bp->b_ioflags &= ~BIO_ERROR; | bp->b_ioflags &= ~BIO_ERROR; | ||||
KASSERT(!(bp->b_flags & B_DONE), | KASSERT(!(bp->b_flags & B_DONE), | ||||
("fuse_io_strategy: bp %p already marked done", bp)); | ("fuse_io_strategy: bp %p already marked done", bp)); | ||||
if (bp->b_iocmd == BIO_READ) { | if (bp->b_iocmd == BIO_READ) { | ||||
ssize_t left; | |||||
io.iov_len = uiop->uio_resid = bp->b_bcount; | io.iov_len = uiop->uio_resid = bp->b_bcount; | ||||
io.iov_base = bp->b_data; | io.iov_base = bp->b_data; | ||||
uiop->uio_rw = UIO_READ; | uiop->uio_rw = UIO_READ; | ||||
uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; | uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize; | ||||
error = fuse_read_directbackend(vp, uiop, cred, fufh); | error = fuse_read_directbackend(vp, uiop, cred, fufh); | ||||
/* XXXCEM: Potentially invalid access to cached_attrs here */ | |||||
if ((!error && uiop->uio_resid) || | |||||
(fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && | |||||
uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && | |||||
uiop->uio_offset >= fvdat->cached_attrs.va_size)) { | |||||
/* | /* | ||||
* If we had a short read with no error, we must have | * Store the amount we failed to read in the buffer's private | ||||
* hit a file hole. We should zero-fill the remainder. | * field, so callers can truncate the file if necessary. | ||||
* This can also occur if the server hits the file EOF. | |||||
* | |||||
* Holes used to be able to occur due to pending | |||||
* writes, but that is not possible any longer. | |||||
*/ | */ | ||||
int nread = bp->b_bcount - uiop->uio_resid; | |||||
int left = uiop->uio_resid; | |||||
if (error != 0) { | if (!error && uiop->uio_resid) { | ||||
printf("FUSE: Fix broken io: offset %ju, " | int nread = bp->b_bcount - uiop->uio_resid; | ||||
" resid %zd, file size %ju/%ju\n", | left = uiop->uio_resid; | ||||
(uintmax_t)uiop->uio_offset, | |||||
uiop->uio_resid, fvdat->filesize, | |||||
fvdat->cached_attrs.va_size); | |||||
error = 0; | |||||
} | |||||
if (left > 0) | |||||
bzero((char *)bp->b_data + nread, left); | bzero((char *)bp->b_data + nread, left); | ||||
if ((fvdat->flag & FN_SIZECHANGE) == 0) { | |||||
/* | |||||
* A short read with no error, when not using | |||||
* direct io, and when no writes are cached, | |||||
* indicates EOF caused by a server-side | |||||
* truncation. Clear the attr cache so we'll | |||||
* pick up the new file size and timestamps. | |||||
* | |||||
* We must still bzero the remaining buffer so | |||||
* uninitialized data doesn't get exposed by a | |||||
* future truncate that extends the file. | |||||
* | |||||
* To prevent lock order problems, we must | |||||
* truncate the file upstack, not here. | |||||
*/ | |||||
SDT_PROBE2(fusefs, , io, trace, 1, | |||||
"Short read of a clean file"); | |||||
fuse_vnode_clear_attr_cache(vp); | |||||
} else { | |||||
/* | |||||
* If dirty writes _are_ cached beyond EOF, | |||||
* that indicates a newly created hole that the | |||||
* server doesn't know about. Those don't pose | |||||
* any problem. | |||||
* XXX: we don't currently track whether dirty | |||||
* writes are cached beyond EOF, before EOF, or | |||||
* both. | |||||
*/ | |||||
SDT_PROBE2(fusefs, , io, trace, 1, | |||||
"Short read of a dirty file"); | |||||
uiop->uio_resid = 0; | uiop->uio_resid = 0; | ||||
} | } | ||||
} | |||||
if (error) { | if (error) { | ||||
bp->b_ioflags |= BIO_ERROR; | bp->b_ioflags |= BIO_ERROR; | ||||
bp->b_error = error; | bp->b_error = error; | ||||
} | } | ||||
} else { | } else { | ||||
/* | /* | ||||
* If we only need to commit, try to commit | |||||
*/ | |||||
if (bp->b_flags & B_NEEDCOMMIT) { | |||||
SDT_PROBE2(fuse, , io, trace, 1, | |||||
"write: B_NEEDCOMMIT flags set"); | |||||
} | |||||
/* | |||||
* Setup for actual write | * Setup for actual write | ||||
*/ | */ | ||||
if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > | error = fuse_vnode_size(vp, &filesize, cred, curthread); | ||||
fvdat->filesize) | if (error) { | ||||
bp->b_dirtyend = fvdat->filesize - | bp->b_ioflags |= BIO_ERROR; | ||||
(off_t)bp->b_blkno * biosize; | bp->b_error = error; | ||||
bufdone(bp); | |||||
return (error); | |||||
} | |||||
if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize) | |||||
bp->b_dirtyend = filesize - | |||||
(off_t)bp->b_lblkno * biosize; | |||||
if (bp->b_dirtyend > bp->b_dirtyoff) { | if (bp->b_dirtyend > bp->b_dirtyoff) { | ||||
io.iov_len = uiop->uio_resid = bp->b_dirtyend | io.iov_len = uiop->uio_resid = bp->b_dirtyend | ||||
- bp->b_dirtyoff; | - bp->b_dirtyoff; | ||||
uiop->uio_offset = (off_t)bp->b_blkno * biosize | uiop->uio_offset = (off_t)bp->b_lblkno * biosize | ||||
+ bp->b_dirtyoff; | + bp->b_dirtyoff; | ||||
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; | io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; | ||||
uiop->uio_rw = UIO_WRITE; | uiop->uio_rw = UIO_WRITE; | ||||
error = fuse_write_directbackend(vp, uiop, cred, fufh, 0); | bool pages = bp->b_flags & B_FUSEFS_WRITE_CACHE; | ||||
error = fuse_write_directbackend(vp, uiop, cred, fufh, | |||||
filesize, 0, pages); | |||||
if (error == EINTR || error == ETIMEDOUT | if (error == EINTR || error == ETIMEDOUT) { | ||||
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) { | |||||
bp->b_flags &= ~(B_INVAL | B_NOCACHE); | bp->b_flags &= ~(B_INVAL | B_NOCACHE); | ||||
if ((bp->b_flags & B_PAGING) == 0) { | if ((bp->b_flags & B_PAGING) == 0) { | ||||
bdirty(bp); | bdirty(bp); | ||||
bp->b_flags &= ~B_DONE; | bp->b_flags &= ~B_DONE; | ||||
} | } | ||||
if ((error == EINTR || error == ETIMEDOUT) && | if ((error == EINTR || error == ETIMEDOUT) && | ||||
(bp->b_flags & B_ASYNC) == 0) | (bp->b_flags & B_ASYNC) == 0) | ||||
bp->b_flags |= B_EINTR; | bp->b_flags |= B_EINTR; | ||||
▲ Show 20 Lines • Show All 85 Lines • Show Last 20 Lines |