Index: sys/fs/fuse/fuse_io.h =================================================================== --- sys/fs/fuse/fuse_io.h +++ sys/fs/fuse/fuse_io.h @@ -65,10 +65,17 @@ #ifndef _FUSE_IO_H_ #define _FUSE_IO_H_ -int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, - struct ucred *cred, pid_t pid); int fuse_io_strategy(struct vnode *vp, struct buf *bp); int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td); int fuse_io_invalbuf(struct vnode *vp, struct thread *td); +int fuse_read_directbackend(struct vnode *vp, struct uio *uio, + struct ucred *cred, struct fuse_filehandle *fufh); +int fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, + struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid); +int fuse_write_directbackend(struct vnode *vp, struct uio *uio, + struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, + int ioflag, bool pages); +int fuse_write_biobackend(struct vnode *vp, struct uio *uio, + struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid); #endif /* _FUSE_IO_H_ */ Index: sys/fs/fuse/fuse_io.c =================================================================== --- sys/fs/fuse/fuse_io.c +++ sys/fs/fuse/fuse_io.c @@ -119,184 +119,11 @@ */ SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*"); -static int -fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end); -static int -fuse_read_directbackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh); -static int -fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, - struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid); -static int -fuse_write_directbackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, - int ioflag, bool pages); -static int -fuse_write_biobackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid); - -/* 
Invalidate a range of cached data, whether dirty of not */ -static int -fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end) -{ - struct buf *bp; - daddr_t left_lbn, end_lbn, right_lbn; - off_t new_filesize; - int iosize, left_on, right_on, right_blksize; - - iosize = fuse_iosize(vp); - left_lbn = start / iosize; - end_lbn = howmany(end, iosize); - left_on = start & (iosize - 1); - if (left_on != 0) { - bp = getblk(vp, left_lbn, iosize, PCATCH, 0, 0); - if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyend >= left_on) { - /* - * Flush the dirty buffer, because we don't have a - * byte-granular way to record which parts of the - * buffer are valid. - */ - bwrite(bp); - if (bp->b_error) - return (bp->b_error); - } else { - brelse(bp); - } - } - right_on = end & (iosize - 1); - if (right_on != 0) { - right_lbn = end / iosize; - new_filesize = MAX(filesize, end); - right_blksize = MIN(iosize, new_filesize - iosize * right_lbn); - bp = getblk(vp, right_lbn, right_blksize, PCATCH, 0, 0); - if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyoff < right_on) { - /* - * Flush the dirty buffer, because we don't have a - * byte-granular way to record which parts of the - * buffer are valid. - */ - bwrite(bp); - if (bp->b_error) - return (bp->b_error); - } else { - brelse(bp); - } - } - - v_inval_buf_range(vp, left_lbn, end_lbn, iosize); - return (0); -} - -SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch, "struct vnode*", "struct uio*", - "int", "struct ucred*", "struct fuse_filehandle*"); -SDT_PROBE_DEFINE4(fusefs, , io, io_dispatch_filehandles_closed, "struct vnode*", - "struct uio*", "int", "struct ucred*"); -int -fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, - struct ucred *cred, pid_t pid) -{ - struct fuse_filehandle *fufh; - int err, directio; - int fflag; - bool closefufh = false; - - MPASS(vp->v_type == VREG || vp->v_type == VDIR); - - fflag = (uio->uio_rw == UIO_READ) ? 
FREAD : FWRITE; - err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); - if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { - /* - * nfsd will do I/O without first doing VOP_OPEN. We - * must implicitly open the file here - */ - err = fuse_filehandle_open(vp, fflag, &fufh, curthread, cred); - closefufh = true; - } - else if (err) { - SDT_PROBE4(fusefs, , io, io_dispatch_filehandles_closed, - vp, uio, ioflag, cred); - printf("FUSE: io dispatch: filehandles are closed\n"); - return err; - } - if (err) - goto out; - SDT_PROBE5(fusefs, , io, io_dispatch, vp, uio, ioflag, cred, fufh); - - /* - * Ideally, when the daemon asks for direct io at open time, the - * standard file flag should be set according to this, so that would - * just change the default mode, which later on could be changed via - * fcntl(2). - * But this doesn't work, the O_DIRECT flag gets cleared at some point - * (don't know where). So to make any use of the Fuse direct_io option, - * we hardwire it into the file's private data (similarly to Linux, - * btw.). 
- */ - directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); - - switch (uio->uio_rw) { - case UIO_READ: - fuse_vnode_update(vp, FN_ATIMECHANGE); - if (directio) { - SDT_PROBE2(fusefs, , io, trace, 1, - "direct read of vnode"); - err = fuse_read_directbackend(vp, uio, cred, fufh); - } else { - SDT_PROBE2(fusefs, , io, trace, 1, - "buffered read of vnode"); - err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh, - pid); - } - break; - case UIO_WRITE: - fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); - if (directio) { - off_t start, end, filesize; - bool pages = (ioflag & IO_VMIO) != 0; - - SDT_PROBE2(fusefs, , io, trace, 1, - "direct write of vnode"); - - err = fuse_vnode_size(vp, &filesize, cred, curthread); - if (err) - goto out; - - start = uio->uio_offset; - end = start + uio->uio_resid; - if (!pages) { - err = fuse_inval_buf_range(vp, filesize, start, - end); - if (err) - return (err); - } - err = fuse_write_directbackend(vp, uio, cred, fufh, - filesize, ioflag, pages); - } else { - SDT_PROBE2(fusefs, , io, trace, 1, - "buffered write of vnode"); - if (!fsess_opt_writeback(vnode_mount(vp))) - ioflag |= IO_SYNC; - err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, - pid); - } - fuse_internal_clear_suid_on_write(vp, cred, uio->uio_td); - break; - default: - panic("uninterpreted mode passed to fuse_io_dispatch"); - } - -out: - if (closefufh) - fuse_filehandle_close(vp, fufh, curthread, cred); - - return (err); -} - SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int"); SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "struct buf*"); SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int", "struct buf*"); -static int +int fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid) { @@ -402,7 +229,7 @@ SDT_PROBE_DEFINE3(fusefs, , io, read_directbackend_complete, "struct fuse_dispatcher*", "struct 
fuse_read_in*", "struct uio*"); -static int +int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { @@ -464,7 +291,7 @@ return (err); } -static int +int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, int ioflag, bool pages) @@ -627,7 +454,7 @@ SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_append_race, "long", "int"); SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_issue, "int", "struct buf*"); -static int +int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid) { Index: sys/fs/fuse/fuse_vnops.c =================================================================== --- sys/fs/fuse/fuse_vnops.c +++ sys/fs/fuse/fuse_vnops.c @@ -318,6 +318,59 @@ return (fifo_specops.vop_close(ap)); } +/* Invalidate a range of cached data, whether dirty or not */ +static int +fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end) +{ + struct buf *bp; + daddr_t left_lbn, end_lbn, right_lbn; + off_t new_filesize; + int iosize, left_on, right_on, right_blksize; + + iosize = fuse_iosize(vp); + left_lbn = start / iosize; + end_lbn = howmany(end, iosize); + left_on = start & (iosize - 1); + if (left_on != 0) { + bp = getblk(vp, left_lbn, iosize, PCATCH, 0, 0); + if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyend >= left_on) { + /* + * Flush the dirty buffer, because we don't have a + * byte-granular way to record which parts of the + * buffer are valid. 
+ */ + bwrite(bp); + if (bp->b_error) + return (bp->b_error); + } else { + brelse(bp); + } + } + right_on = end & (iosize - 1); + if (right_on != 0) { + right_lbn = end / iosize; + new_filesize = MAX(filesize, end); + right_blksize = MIN(iosize, new_filesize - iosize * right_lbn); + bp = getblk(vp, right_lbn, right_blksize, PCATCH, 0, 0); + if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyoff < right_on) { + /* + * Flush the dirty buffer, because we don't have a + * byte-granular way to record which parts of the + * buffer are valid. + */ + bwrite(bp); + if (bp->b_error) + return (bp->b_error); + } else { + brelse(bp); + } + } + + v_inval_buf_range(vp, left_lbn, end_lbn, iosize); + return (0); +} + + /* Send FUSE_LSEEK for this node */ static int fuse_vnop_do_lseek(struct vnode *vp, struct thread *td, struct ucred *cred, @@ -667,6 +720,7 @@ struct fuse_write_out *fwo; struct thread *td; struct uio io; + off_t outfilesize; pid_t pid; int err; @@ -722,6 +776,15 @@ goto unlock; } + err = fuse_vnode_size(outvp, &outfilesize, outcred, curthread); + if (err) + goto unlock; + + err = fuse_inval_buf_range(outvp, outfilesize, *ap->a_outoffp, + *ap->a_outoffp + *ap->a_lenp); + if (err) + goto unlock; + fdisp_init(&fdi, sizeof(*fcfri)); fdisp_make_vp(&fdi, FUSE_COPY_FILE_RANGE, invp, td, incred); fcfri = fdi.indata; @@ -1587,6 +1650,8 @@ } } +SDT_PROBE_DEFINE3(fusefs, , vnops, filehandles_closed, "struct vnode*", + "struct uio*", "struct ucred*"); /* struct vnop_read_args { struct vnode *a_vp; @@ -1603,6 +1668,11 @@ int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; + struct fuse_filehandle *fufh; + int err; + bool closefufh = false, directio; + + MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { return ENXIO; @@ -1612,7 +1682,45 @@ ioflag |= IO_DIRECT; } - return fuse_io_dispatch(vp, uio, ioflag, cred, pid); + err = fuse_filehandle_getrw(vp, FREAD, &fufh, cred, pid); + if (err == EBADF && 
vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { + /* + * nfsd will do I/O without first doing VOP_OPEN. We + * must implicitly open the file here + */ + err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred); + closefufh = true; + } + if (err) { + SDT_PROBE3(fusefs, , vnops, filehandles_closed, vp, uio, cred); + return err; + } + + /* + * Ideally, when the daemon asks for direct io at open time, the + * standard file flag should be set according to this, so that would + * just change the default mode, which later on could be changed via + * fcntl(2). + * But this doesn't work, the O_DIRECT flag gets cleared at some point + * (don't know where). So to make any use of the Fuse direct_io option, + * we hardwire it into the file's private data (similarly to Linux, + * btw.). + */ + directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); + + fuse_vnode_update(vp, FN_ATIMECHANGE); + if (directio) { + SDT_PROBE2(fusefs, , vnops, trace, 1, "direct read of vnode"); + err = fuse_read_directbackend(vp, uio, cred, fufh); + } else { + SDT_PROBE2(fusefs, , vnops, trace, 1, "buffered read of vnode"); + err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh, pid); + } + + if (closefufh) + fuse_filehandle_close(vp, fufh, curthread, cred); + + return (err); } /* @@ -2165,6 +2273,11 @@ int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; + struct fuse_filehandle *fufh; + int err; + bool closefufh = false, directio; + + MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { return ENXIO; @@ -2174,7 +2287,67 @@ ioflag |= IO_DIRECT; } - return fuse_io_dispatch(vp, uio, ioflag, cred, pid); + err = fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); + if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { + /* + * nfsd will do I/O without first doing VOP_OPEN. 
We + * must implicitly open the file here + */ + err = fuse_filehandle_open(vp, FWRITE, &fufh, curthread, cred); + closefufh = true; + } + if (err) { + SDT_PROBE3(fusefs, , vnops, filehandles_closed, vp, uio, cred); + return err; + } + + /* + * Ideally, when the daemon asks for direct io at open time, the + * standard file flag should be set according to this, so that would + * just change the default mode, which later on could be changed via + * fcntl(2). + * But this doesn't work, the O_DIRECT flag gets cleared at some point + * (don't know where). So to make any use of the Fuse direct_io option, + * we hardwire it into the file's private data (similarly to Linux, + * btw.). + */ + directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); + + fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); + if (directio) { + off_t start, end, filesize; + bool pages = (ioflag & IO_VMIO) != 0; + + SDT_PROBE2(fusefs, , vnops, trace, 1, "direct write of vnode"); + + err = fuse_vnode_size(vp, &filesize, cred, curthread); + if (err) + goto out; + + start = uio->uio_offset; + end = start + uio->uio_resid; + if (!pages) { + err = fuse_inval_buf_range(vp, filesize, start, + end); + if (err) + goto out; + } + err = fuse_write_directbackend(vp, uio, cred, fufh, + filesize, ioflag, pages); + } else { + SDT_PROBE2(fusefs, , vnops, trace, 1, + "buffered write of vnode"); + if (!fsess_opt_writeback(vnode_mount(vp))) + ioflag |= IO_SYNC; + err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, pid); + } + fuse_internal_clear_suid_on_write(vp, cred, uio->uio_td); + +out: + if (closefufh) + fuse_filehandle_close(vp, fufh, curthread, cred); + + return (err); } static daddr_t Index: tests/sys/fs/fusefs/copy_file_range.cc =================================================================== --- tests/sys/fs/fusefs/copy_file_range.cc +++ tests/sys/fs/fusefs/copy_file_range.cc @@ -171,6 +171,84 @@ EXPECT_EQ(EIO, errno); } +/* + * copy_file_range should evict cached data for the 
modified region of the + * destination file. + */ +TEST_F(CopyFileRange, evicts_cache) +{ + const char FULLPATH1[] = "mountpoint/src.txt"; + const char RELPATH1[] = "src.txt"; + const char FULLPATH2[] = "mountpoint/dst.txt"; + const char RELPATH2[] = "dst.txt"; + void *buf0, *buf1, *buf; + const uint64_t ino1 = 42; + const uint64_t ino2 = 43; + const uint64_t fh1 = 0xdeadbeef1a7ebabe; + const uint64_t fh2 = 0xdeadc0de88c0ffee; + off_t fsize1 = 1 << 20; /* 1 MiB */ + off_t fsize2 = 1 << 19; /* 512 KiB */ + off_t start1 = 1 << 18; + off_t start2 = 3 << 17; + ssize_t len = m_maxbcachebuf; + int fd1, fd2; + + buf0 = malloc(m_maxbcachebuf); + memset(buf0, 42, m_maxbcachebuf); + + expect_lookup(RELPATH1, ino1, S_IFREG | 0644, fsize1, 1); + expect_lookup(RELPATH2, ino2, S_IFREG | 0644, fsize2, 1); + expect_open(ino1, 0, 1, fh1); + expect_open(ino2, 0, 1, fh2); + expect_read(ino2, start2, m_maxbcachebuf, m_maxbcachebuf, buf0, -1, + fh2); + EXPECT_CALL(*m_mock, process( + ResultOf([=](auto in) { + return (in.header.opcode == FUSE_COPY_FILE_RANGE && + in.header.nodeid == ino1 && + in.body.copy_file_range.fh_in == fh1 && + (off_t)in.body.copy_file_range.off_in == start1 && + in.body.copy_file_range.nodeid_out == ino2 && + in.body.copy_file_range.fh_out == fh2 && + (off_t)in.body.copy_file_range.off_out == start2 && + in.body.copy_file_range.len == (size_t)len && + in.body.copy_file_range.flags == 0); + }, Eq(true)), + _) + ).WillOnce(Invoke(ReturnImmediate([=](auto in __unused, auto& out) { + SET_OUT_HEADER_LEN(out, write); + out.body.write.size = len; + }))); + + fd1 = open(FULLPATH1, O_RDONLY); + fd2 = open(FULLPATH2, O_RDWR); + + // Prime cache + buf = malloc(m_maxbcachebuf); + ASSERT_EQ(m_maxbcachebuf, pread(fd2, buf, m_maxbcachebuf, start2)) + << strerror(errno); + EXPECT_EQ(0, memcmp(buf0, buf, m_maxbcachebuf)); + + // Tell the FUSE server to overwrite the region we just read + ASSERT_EQ(len, copy_file_range(fd1, &start1, fd2, &start2, len, 0)); + + // Read again. 
This should bypass the cache and read directly from the server + buf1 = malloc(m_maxbcachebuf); + memset(buf1, 69, m_maxbcachebuf); + start2 -= len; + expect_read(ino2, start2, m_maxbcachebuf, m_maxbcachebuf, buf1, -1, + fh2); + ASSERT_EQ(m_maxbcachebuf, pread(fd2, buf, m_maxbcachebuf, start2)) + << strerror(errno); + EXPECT_EQ(0, memcmp(buf1, buf, m_maxbcachebuf)); + + free(buf1); + free(buf0); + free(buf); + leak(fd1); + leak(fd2); +} + /* * If the server doesn't support FUSE_COPY_FILE_RANGE, the kernel should * fallback to a read/write based implementation. Index: tests/sys/fs/fusefs/utils.hh =================================================================== --- tests/sys/fs/fusefs/utils.hh +++ tests/sys/fs/fusefs/utils.hh @@ -175,7 +175,8 @@ * nothing currently validates the size of the fuse_read_in struct. */ void expect_read(uint64_t ino, uint64_t offset, uint64_t isize, - uint64_t osize, const void *contents, int flags = -1); + uint64_t osize, const void *contents, int flags = -1, + uint64_t fh = FH); /* * Create an expectation that FUSE_READIR will be called any number of Index: tests/sys/fs/fusefs/utils.cc =================================================================== --- tests/sys/fs/fusefs/utils.cc +++ tests/sys/fs/fusefs/utils.cc @@ -367,13 +367,13 @@ } void FuseTest::expect_read(uint64_t ino, uint64_t offset, uint64_t isize, - uint64_t osize, const void *contents, int flags) + uint64_t osize, const void *contents, int flags, uint64_t fh) { EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in.header.opcode == FUSE_READ && in.header.nodeid == ino && - in.body.read.fh == FH && + in.body.read.fh == fh && in.body.read.offset == offset && in.body.read.size == isize && (flags == -1 ?