diff --git a/sys/fs/fuse/fuse_internal.c b/sys/fs/fuse/fuse_internal.c index d17a315c3cbd..cd24acd40355 100644 --- a/sys/fs/fuse/fuse_internal.c +++ b/sys/fs/fuse/fuse_internal.c @@ -1,1097 +1,1097 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_io.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_file.h" SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , internal, trace, "int", "char*"); #ifdef ZERO_PAD_INCOMPLETE_BUFS static int isbzero(void *buf, size_t len); #endif int fuse_internal_get_cached_vnode(struct mount* mp, ino_t ino, int flags, struct vnode **vpp) { struct bintime now; struct thread *td = curthread; uint64_t nodeid = ino; int error; *vpp = NULL; error = vfs_hash_get(mp, fuse_vnode_hash(nodeid), flags, td, vpp, fuse_vnode_cmp, &nodeid); if (error) return error; /* * Check the entry cache timeout. We have to do this within fusefs * instead of by using cache_enter_time/cache_lookup because those * routines are only intended to work with pathnames, not inodes */ if (*vpp != NULL) { getbinuptime(&now); if (bintime_cmp(&(VTOFUD(*vpp)->entry_cache_timeout), &now, >)){ atomic_add_acq_long(&fuse_lookup_cache_hits, 1); return 0; } else { /* Entry cache timeout */ atomic_add_acq_long(&fuse_lookup_cache_misses, 1); cache_purge(*vpp); vput(*vpp); *vpp = NULL; } } return 0; } /* Synchronously send a FUSE_ACCESS operation */ int fuse_internal_access(struct vnode *vp, accmode_t mode, struct thread *td, struct ucred *cred) { int err = 0; uint32_t mask = F_OK; int dataflags; int vtype; struct mount *mp; struct fuse_dispatcher fdi; struct fuse_access_in *fai; struct fuse_data *data; mp = vnode_mount(vp); vtype = vnode_vtype(vp); data = fuse_get_mpdata(mp); dataflags = data->dataflags; if (mode == 0) return 0; if (mode & VMODIFY_PERMS && vfs_isrdonly(mp)) { switch (vp->v_type) { case VDIR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VREG: return EROFS; default: break; } } /* Unless explicitly permitted, deny everyone except the fs owner. */ if (!(dataflags & FSESS_DAEMON_CAN_SPY)) { if (fuse_match_cred(data->daemoncred, cred)) return EPERM; } if (dataflags & FSESS_DEFAULT_PERMISSIONS) { struct vattr va; fuse_internal_getattr(vp, &va, cred, td); return vaccess(vp->v_type, va.va_mode, va.va_uid, va.va_gid, mode, cred, NULL); } if (!fsess_isimpl(mp, FUSE_ACCESS)) return 0; if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0) mask |= W_OK; if ((mode & VREAD) != 0) mask |= R_OK; if ((mode & VEXEC) != 0) mask |= X_OK; fdisp_init(&fdi, sizeof(*fai)); fdisp_make_vp(&fdi, FUSE_ACCESS, vp, td, cred); fai = fdi.indata; fai->mask = mask; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_ACCESS); err = 0; } return err; } /* * Cache FUSE attributes from attr, in attribute cache associated with vnode * 'vp'. Optionally, if argument 'vap' is not NULL, store a copy of the * converted attributes there as well. * * If the nominal attribute cache TTL is zero, do not cache on the 'vp' (but do * return the result to the caller). 
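* Note that if the size reported by the daemon differs from the size cached for a regular file, the buffer cache is resized via fuse_vnode_setsize() before the new attributes are stored.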
*/ void fuse_internal_cache_attrs(struct vnode *vp, struct fuse_attr *attr, uint64_t attr_valid, uint32_t attr_valid_nsec, struct vattr *vap) { struct mount *mp; struct fuse_vnode_data *fvdat; struct fuse_data *data; struct vattr *vp_cache_at; mp = vnode_mount(vp); fvdat = VTOFUD(vp); data = fuse_get_mpdata(mp); ASSERT_VOP_ELOCKED(vp, "fuse_internal_cache_attrs"); fuse_validity_2_bintime(attr_valid, attr_valid_nsec, &fvdat->attr_cache_timeout); /* Fix our buffers if the filesize changed without us knowing */ if (vnode_isreg(vp) && attr->size != fvdat->cached_attrs.va_size) { (void)fuse_vnode_setsize(vp, attr->size); fvdat->cached_attrs.va_size = attr->size; } if (attr_valid > 0 || attr_valid_nsec > 0) vp_cache_at = &(fvdat->cached_attrs); else if (vap != NULL) vp_cache_at = vap; else return; vattr_null(vp_cache_at); vp_cache_at->va_fsid = mp->mnt_stat.f_fsid.val[0]; vp_cache_at->va_fileid = attr->ino; vp_cache_at->va_mode = attr->mode & ~S_IFMT; vp_cache_at->va_nlink = attr->nlink; vp_cache_at->va_uid = attr->uid; vp_cache_at->va_gid = attr->gid; vp_cache_at->va_rdev = attr->rdev; vp_cache_at->va_size = attr->size; /* XXX on i386, seconds are truncated to 32 bits */ vp_cache_at->va_atime.tv_sec = attr->atime; vp_cache_at->va_atime.tv_nsec = attr->atimensec; vp_cache_at->va_mtime.tv_sec = attr->mtime; vp_cache_at->va_mtime.tv_nsec = attr->mtimensec; vp_cache_at->va_ctime.tv_sec = attr->ctime; vp_cache_at->va_ctime.tv_nsec = attr->ctimensec; if (fuse_libabi_geq(data, 7, 9) && attr->blksize > 0) vp_cache_at->va_blocksize = attr->blksize; else vp_cache_at->va_blocksize = PAGE_SIZE; vp_cache_at->va_type = IFTOVT(attr->mode); vp_cache_at->va_bytes = attr->blocks * S_BLKSIZE; vp_cache_at->va_flags = 0; if (vap != vp_cache_at && vap != NULL) memcpy(vap, vp_cache_at, sizeof(*vap)); } /* fsync */ int fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio) { if (tick->tk_aw_ohead.error == ENOSYS) { fsess_set_notimpl(tick->tk_data->mp, fticket_opcode(tick)); } return 0; } int fuse_internal_fsync(struct vnode *vp, struct thread *td, int waitfor, bool datasync) { struct fuse_fsync_in *ffsi = NULL; struct fuse_dispatcher fdi; struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct mount *mp = vnode_mount(vp); int op = FUSE_FSYNC; int err = 0; if (!fsess_isimpl(vnode_mount(vp), (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) { return 0; } if (vnode_isdir(vp)) op = FUSE_FSYNCDIR; if (!fsess_isimpl(mp, op)) return 0; fdisp_init(&fdi, sizeof(*ffsi)); /* * fsync every open file handle for this file, because we can't be sure * which file handle the caller is really referring to. 
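* If the daemon answers ENOSYS, the opcode is recorded as unimplemented for this mount and the loop stops early; later fsyncs on this mount then become no-ops.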
*/ LIST_FOREACH(fufh, &fvdat->handles, next) { if (ffsi == NULL) fdisp_make_vp(&fdi, op, vp, td, NULL); else fdisp_refresh_vp(&fdi, op, vp, td, NULL); ffsi = fdi.indata; ffsi->fh = fufh->fh_id; ffsi->fsync_flags = 0; if (datasync) ffsi->fsync_flags = 1; if (waitfor == MNT_WAIT) { err = fdisp_wait_answ(&fdi); } else { fuse_insert_callback(fdi.tick, fuse_internal_fsync_callback); fuse_insert_message(fdi.tick, false); } if (err == ENOSYS) { /* ENOSYS means "success, and don't call again" */ fsess_set_notimpl(mp, op); err = 0; break; } } fdisp_destroy(&fdi); return err; } /* Asynchronous invalidation */ SDT_PROBE_DEFINE2(fusefs, , internal, invalidate_cache_hit, "struct vnode*", "struct vnode*"); int fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio) { struct fuse_notify_inval_entry_out fnieo; struct componentname cn; struct vnode *dvp, *vp; char name[PATH_MAX]; int err; if ((err = uiomove(&fnieo, sizeof(fnieo), uio)) != 0) return (err); if ((err = uiomove(name, fnieo.namelen, uio)) != 0) return (err); name[fnieo.namelen] = '\0'; /* fusefs does not cache "." or ".." entries */ if (strncmp(name, ".", sizeof(".")) == 0 || strncmp(name, "..", sizeof("..")) == 0) return (0); if (fnieo.parent == FUSE_ROOT_ID) err = VFS_ROOT(mp, LK_SHARED, &dvp); else err = fuse_internal_get_cached_vnode( mp, fnieo.parent, LK_SHARED, &dvp); /* * If dvp is not in the cache, then it must've been reclaimed. And * since fuse_vnop_reclaim does a cache_purge, name's entry must've * been invalidated already. So we can safely return if dvp == NULL */ if (err != 0 || dvp == NULL) return (err); /* * XXX we can't check dvp's generation because the FUSE invalidate * entry message doesn't include it. Worse case is that we invalidate * an entry that didn't need to be invalidated. */ cn.cn_nameiop = LOOKUP; cn.cn_flags = 0; /* !MAKEENTRY means free cached entry */ cn.cn_thread = curthread; cn.cn_cred = curthread->td_ucred; cn.cn_lkflags = LK_SHARED; cn.cn_pnbuf = NULL; cn.cn_nameptr = name; cn.cn_namelen = fnieo.namelen; err = cache_lookup(dvp, &vp, &cn, NULL, NULL); MPASS(err == 0); fuse_vnode_clear_attr_cache(dvp); vput(dvp); return (0); } int fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio) { struct fuse_notify_inval_inode_out fniio; struct vnode *vp; int err; if ((err = uiomove(&fniio, sizeof(fniio), uio)) != 0) return (err); if (fniio.ino == FUSE_ROOT_ID) err = VFS_ROOT(mp, LK_EXCLUSIVE, &vp); else err = fuse_internal_get_cached_vnode(mp, fniio.ino, LK_SHARED, &vp); if (err != 0 || vp == NULL) return (err); /* * XXX we can't check vp's generation because the FUSE invalidate * entry message doesn't include it. Worse case is that we invalidate * an inode that didn't need to be invalidated. */ /* * Flush and invalidate buffers if off >= 0. Technically we only need * to flush and invalidate the range of offsets [off, off + len), but * for simplicity's sake we do everything. 
*/ if (fniio.off >= 0) fuse_io_invalbuf(vp, curthread); fuse_vnode_clear_attr_cache(vp); vput(vp); return (0); } /* mknod */ int fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap) { struct fuse_data *data; struct fuse_mknod_in fmni; size_t insize; data = fuse_get_mpdata(dvp->v_mount); fmni.mode = MAKEIMODE(vap->va_type, vap->va_mode); fmni.rdev = vap->va_rdev; if (fuse_libabi_geq(data, 7, 12)) { insize = sizeof(fmni); fmni.umask = curthread->td_proc->p_fd->fd_cmask; } else { insize = FUSE_COMPAT_MKNOD_IN_SIZE; } return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKNOD, &fmni, insize, vap->va_type)); } /* readdir */ int fuse_internal_readdir(struct vnode *vp, struct uio *uio, off_t startoff, struct fuse_filehandle *fufh, struct fuse_iov *cookediov, int *ncookies, u_long *cookies) { int err = 0; struct fuse_dispatcher fdi; struct fuse_read_in *fri = NULL; int fnd_start; if (uio_resid(uio) == 0) return 0; fdisp_init(&fdi, 0); /* * Note that we DO NOT have a UIO_SYSSPACE here (so no need for p2p * I/O). */ /* * fnd_start is set non-zero once the offset in the directory gets * to the startoff. This is done because directories must be read * from the beginning (offset == 0) when fuse_vnop_readdir() needs * to do an open of the directory. * If it is not set non-zero here, it will be set non-zero in * fuse_internal_readdir_processdata() when uio_offset == startoff. */ fnd_start = 0; if (uio->uio_offset == startoff) fnd_start = 1; while (uio_resid(uio) > 0) { fdi.iosize = sizeof(*fri); if (fri == NULL) fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL); else fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL); fri = fdi.indata; fri->fh = fufh->fh_id; fri->offset = uio_offset(uio); fri->size = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_read); if ((err = fdisp_wait_answ(&fdi))) break; if ((err = fuse_internal_readdir_processdata(uio, startoff, &fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov, ncookies, &cookies))) break; } fdisp_destroy(&fdi); return ((err == -1) ? 0 : err); } /* * Return -1 to indicate that this readdir is finished, 0 if it copied * all the directory data read in and it may be possible to read more * and greater than 0 for a failure. */ int fuse_internal_readdir_processdata(struct uio *uio, off_t startoff, int *fnd_start, size_t reqsize, void *buf, size_t bufsize, struct fuse_iov *cookediov, int *ncookies, u_long **cookiesp) { int err = 0; int bytesavail; size_t freclen; struct dirent *de; struct fuse_dirent *fudge; u_long *cookies; cookies = *cookiesp; if (bufsize < FUSE_NAME_OFFSET) return -1; for (;;) { if (bufsize < FUSE_NAME_OFFSET) { err = -1; break; } fudge = (struct fuse_dirent *)buf; freclen = FUSE_DIRENT_SIZE(fudge); if (bufsize < freclen) { /* * This indicates a partial directory entry at the * end of the directory data. */ err = -1; break; } #ifdef ZERO_PAD_INCOMPLETE_BUFS if (isbzero(buf, FUSE_NAME_OFFSET)) { err = -1; break; } #endif if (!fudge->namelen || fudge->namelen > MAXNAMLEN) { err = EINVAL; break; } bytesavail = GENERIC_DIRSIZ((struct pseudo_dirent *) &fudge->namelen); if (bytesavail > uio_resid(uio)) { /* Out of space for the dir so we are done. */ err = -1; break; } /* * Don't start to copy the directory entries out until * the requested offset in the directory is found. 
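* Once found, each fuse_dirent is converted in cookediov into a struct dirent, NUL-terminated with dirent_terminate(), and copied out with uiomove(9); the entry's off field is also recorded as the directory cookie when the caller supplied a cookie array.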
*/ if (*fnd_start != 0) { fiov_adjust(cookediov, bytesavail); bzero(cookediov->base, bytesavail); de = (struct dirent *)cookediov->base; de->d_fileno = fudge->ino; de->d_reclen = bytesavail; de->d_type = fudge->type; de->d_namlen = fudge->namelen; memcpy((char *)cookediov->base + sizeof(struct dirent) - MAXNAMLEN - 1, (char *)buf + FUSE_NAME_OFFSET, fudge->namelen); dirent_terminate(de); err = uiomove(cookediov->base, cookediov->len, uio); if (err) break; if (cookies != NULL) { if (*ncookies == 0) { err = -1; break; } *cookies = fudge->off; cookies++; (*ncookies)--; } } else if (startoff == fudge->off) *fnd_start = 1; buf = (char *)buf + freclen; bufsize -= freclen; uio_setoffset(uio, fudge->off); } *cookiesp = cookies; return err; } /* remove */ int fuse_internal_remove(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, enum fuse_opcode op) { struct fuse_dispatcher fdi; int err = 0; fdisp_init(&fdi, cnp->cn_namelen + 1); fdisp_make_vp(&fdi, op, dvp, cnp->cn_thread, cnp->cn_cred); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); return err; } /* rename */ int fuse_internal_rename(struct vnode *fdvp, struct componentname *fcnp, struct vnode *tdvp, struct componentname *tcnp) { struct fuse_dispatcher fdi; struct fuse_rename_in *fri; int err = 0; fdisp_init(&fdi, sizeof(*fri) + fcnp->cn_namelen + tcnp->cn_namelen + 2); fdisp_make_vp(&fdi, FUSE_RENAME, fdvp, tcnp->cn_thread, tcnp->cn_cred); fri = fdi.indata; fri->newdir = VTOI(tdvp); memcpy((char *)fdi.indata + sizeof(*fri), fcnp->cn_nameptr, fcnp->cn_namelen); ((char *)fdi.indata)[sizeof(*fri) + fcnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + sizeof(*fri) + fcnp->cn_namelen + 1, tcnp->cn_nameptr, tcnp->cn_namelen); ((char *)fdi.indata)[sizeof(*fri) + fcnp->cn_namelen + tcnp->cn_namelen + 1] = '\0'; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); return err; } /* strategy */ /* entity creation */ void fuse_internal_newentry_makerequest(struct mount *mp, uint64_t dnid, struct componentname *cnp, enum fuse_opcode op, void *buf, size_t bufsize, struct fuse_dispatcher *fdip) { fdip->iosize = bufsize + cnp->cn_namelen + 1; fdisp_make(fdip, op, mp, dnid, cnp->cn_thread, cnp->cn_cred); memcpy(fdip->indata, buf, bufsize); memcpy((char *)fdip->indata + bufsize, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[bufsize + cnp->cn_namelen] = '\0'; } int fuse_internal_newentry_core(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum vtype vtyp, struct fuse_dispatcher *fdip) { int err = 0; struct fuse_entry_out *feo; struct mount *mp = vnode_mount(dvp); if ((err = fdisp_wait_answ(fdip))) { return err; } feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, vtyp))) { return err; } err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vtyp); if (err) { fuse_internal_forget_send(mp, cnp->cn_thread, cnp->cn_cred, feo->nodeid, 1); return err; } /* * Purge the parent's attribute cache because the daemon should've * updated its mtime and ctime */ fuse_vnode_clear_attr_cache(dvp); fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); return err; } int fuse_internal_newentry(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum fuse_opcode op, void *buf, size_t bufsize, enum vtype vtype) { int err; struct fuse_dispatcher fdi; struct mount *mp = vnode_mount(dvp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(mp, VTOI(dvp), cnp, op, buf, bufsize, 
&fdi); err = fuse_internal_newentry_core(dvp, vpp, cnp, vtype, &fdi); fdisp_destroy(&fdi); return err; } /* entity destruction */ int fuse_internal_forget_callback(struct fuse_ticket *ftick, struct uio *uio) { fuse_internal_forget_send(ftick->tk_data->mp, curthread, NULL, ((struct fuse_in_header *)ftick->tk_ms_fiov.base)->nodeid, 1); return 0; } void fuse_internal_forget_send(struct mount *mp, struct thread *td, struct ucred *cred, uint64_t nodeid, uint64_t nlookup) { struct fuse_dispatcher fdi; struct fuse_forget_in *ffi; /* * KASSERT(nlookup > 0, ("zero-times forget for vp #%llu", * (long long unsigned) nodeid)); */ fdisp_init(&fdi, sizeof(*ffi)); fdisp_make(&fdi, FUSE_FORGET, mp, nodeid, td, cred); ffi = fdi.indata; ffi->nlookup = nlookup; fuse_insert_message(fdi.tick, false); fdisp_destroy(&fdi); } /* Fetch the vnode's attributes from the daemon*/ int fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { struct fuse_dispatcher fdi; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_getattr_in *fgai; struct fuse_attr_out *fao; off_t old_filesize = fvdat->cached_attrs.va_size; enum vtype vtyp; int err; fdisp_init(&fdi, 0); fdisp_make_vp(&fdi, FUSE_GETATTR, vp, td, cred); fgai = fdi.indata; /* * We could look up a file handle and set it in fgai->fh, but that * involves extra runtime work and I'm unaware of any file systems that * care. */ fgai->getattr_flags = 0; if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) { if (err == ENOENT) fuse_internal_vnode_disappear(vp); goto out; } fao = (struct fuse_attr_out *)fdi.answ; vtyp = IFTOVT(fao->attr.mode); if (fvdat->flag & FN_SIZECHANGE) fao->attr.size = old_filesize; fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, fao->attr_valid_nsec, vap); if (vtyp != vnode_vtype(vp)) { fuse_internal_vnode_disappear(vp); err = ENOENT; } out: fdisp_destroy(&fdi); return err; } /* Read a vnode's attributes from cache or fetch them from the fuse daemon */ int fuse_internal_getattr(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { struct vattr *attrs; if ((attrs = VTOVA(vp)) != NULL) { *vap = *attrs; /* struct copy */ return 0; } return fuse_internal_do_getattr(vp, vap, cred, td); } void fuse_internal_vnode_disappear(struct vnode *vp) { struct fuse_vnode_data *fvdat = VTOFUD(vp); ASSERT_VOP_ELOCKED(vp, "fuse_internal_vnode_disappear"); fvdat->flag |= FN_REVOKED; bintime_clear(&fvdat->attr_cache_timeout); bintime_clear(&fvdat->entry_cache_timeout); cache_purge(vp); } /* fuse start/stop */ int fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio) { int err = 0; struct fuse_data *data = tick->tk_data; struct fuse_init_out *fiio; if ((err = tick->tk_aw_ohead.error)) { goto out; } if ((err = fticket_pull(tick, uio))) { goto out; } fiio = fticket_resp(tick)->base; /* XXX: Do we want to check anything further besides this? 
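* At present we only reject daemons whose major protocol version is below 7; everything else is negotiated from the minor version and the feature flags in fuse_init_out below.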
*/ if (fiio->major < 7) { SDT_PROBE2(fusefs, , internal, trace, 1, "userpace version too low"); err = EPROTONOSUPPORT; goto out; } data->fuse_libabi_major = fiio->major; data->fuse_libabi_minor = fiio->minor; if (fuse_libabi_geq(data, 7, 5)) { if (fticket_resp(tick)->len == sizeof(struct fuse_init_out)) { + data->max_readahead = fiio->max_readahead; data->max_write = fiio->max_write; if (fiio->flags & FUSE_ASYNC_READ) data->dataflags |= FSESS_ASYNC_READ; if (fiio->flags & FUSE_POSIX_LOCKS) data->dataflags |= FSESS_POSIX_LOCKS; if (fiio->flags & FUSE_EXPORT_SUPPORT) data->dataflags |= FSESS_EXPORT_SUPPORT; /* * Don't bother to check FUSE_BIG_WRITES, because it's * redundant with max_write */ } else { err = EINVAL; } } else { /* Old fix values */ data->max_write = 4096; } out: if (err) { fdata_set_dead(data); } FUSE_LOCK(); data->dataflags |= FSESS_INITED; wakeup(&data->ticketer); FUSE_UNLOCK(); return 0; } void fuse_internal_send_init(struct fuse_data *data, struct thread *td) { struct fuse_init_in *fiii; struct fuse_dispatcher fdi; fdisp_init(&fdi, sizeof(*fiii)); fdisp_make(&fdi, FUSE_INIT, data->mp, 0, td, NULL); fiii = fdi.indata; fiii->major = FUSE_KERNEL_VERSION; fiii->minor = FUSE_KERNEL_MINOR_VERSION; /* - * fusefs currently doesn't do any readahead other than fetching whole - * buffer cache block sized regions at once. So the max readahead is - * the size of a buffer cache block. + * fusefs currently reads ahead no more than one cache block at a time. + * See fuse_read_biobackend */ fiii->max_readahead = maxbcachebuf; /* * Unsupported features: * FUSE_FILE_OPS: No known FUSE server or client supports it * FUSE_ATOMIC_O_TRUNC: our VFS cannot support it * FUSE_DONT_MASK: unlike Linux, FreeBSD always applies the umask, even * when default ACLs are in use. */ fiii->flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES; fuse_insert_callback(fdi.tick, fuse_internal_init_callback); fuse_insert_message(fdi.tick, false); fdisp_destroy(&fdi); } /* * Send a FUSE_SETATTR operation with no permissions checks. If cred is NULL, * send the request with root credentials */ int fuse_internal_setattr(struct vnode *vp, struct vattr *vap, struct thread *td, struct ucred *cred) { struct fuse_dispatcher fdi; struct fuse_setattr_in *fsai; struct mount *mp; pid_t pid = td->td_proc->p_pid; struct fuse_data *data; int dataflags; int err = 0; enum vtype vtyp; int sizechanged = -1; uint64_t newsize = 0; mp = vnode_mount(vp); data = fuse_get_mpdata(mp); dataflags = data->dataflags; fdisp_init(&fdi, sizeof(*fsai)); fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); if (!cred) { fdi.finh->uid = 0; fdi.finh->gid = 0; } fsai = fdi.indata; fsai->valid = 0; if (vap->va_uid != (uid_t)VNOVAL) { fsai->uid = vap->va_uid; fsai->valid |= FATTR_UID; } if (vap->va_gid != (gid_t)VNOVAL) { fsai->gid = vap->va_gid; fsai->valid |= FATTR_GID; } if (vap->va_size != VNOVAL) { struct fuse_filehandle *fufh = NULL; /*Truncate to a new value. 
*/ fsai->size = vap->va_size; sizechanged = 1; newsize = vap->va_size; fsai->valid |= FATTR_SIZE; fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); if (fufh) { fsai->fh = fufh->fh_id; fsai->valid |= FATTR_FH; } VTOFUD(vp)->flag &= ~FN_SIZECHANGE; } if (vap->va_atime.tv_sec != VNOVAL) { fsai->atime = vap->va_atime.tv_sec; fsai->atimensec = vap->va_atime.tv_nsec; fsai->valid |= FATTR_ATIME; if (vap->va_vaflags & VA_UTIMES_NULL) fsai->valid |= FATTR_ATIME_NOW; } if (vap->va_mtime.tv_sec != VNOVAL) { fsai->mtime = vap->va_mtime.tv_sec; fsai->mtimensec = vap->va_mtime.tv_nsec; fsai->valid |= FATTR_MTIME; if (vap->va_vaflags & VA_UTIMES_NULL) fsai->valid |= FATTR_MTIME_NOW; } if (vap->va_mode != (mode_t)VNOVAL) { fsai->mode = vap->va_mode & ALLPERMS; fsai->valid |= FATTR_MODE; } if (!fsai->valid) { goto out; } if ((err = fdisp_wait_answ(&fdi))) goto out; vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode); if (vnode_vtype(vp) != vtyp) { if (vnode_vtype(vp) == VNON && vtyp != VNON) { SDT_PROBE2(fusefs, , internal, trace, 1, "FUSE: Dang! " "vnode_vtype is VNON and vtype isn't."); } else { /* * STALE vnode, ditch * * The vnode has changed its type "behind our back". * There's nothing really we can do, so let us just * force an internal revocation and tell the caller to * try again, if interested. */ fuse_internal_vnode_disappear(vp); err = EAGAIN; } } if (err == 0) { struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ; fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, fao->attr_valid_nsec, NULL); } out: fdisp_destroy(&fdi); return err; } #ifdef ZERO_PAD_INCOMPLETE_BUFS static int isbzero(void *buf, size_t len) { int i; for (i = 0; i < len; i++) { if (((char *)buf)[i]) return (0); } return (1); } #endif diff --git a/sys/fs/fuse/fuse_io.c b/sys/fs/fuse/fuse_io.c index e2a7c19d1cfa..e923d93bfcdb 100644 --- a/sys/fs/fuse/fuse_io.c +++ b/sys/fs/fuse/fuse_io.c @@ -1,1020 +1,1039 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_node.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_io.h" SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*"); static void fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred, struct thread *td); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid); static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, int ioflag, bool pages); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid); /* * FreeBSD clears the SUID and SGID bits on any write by a non-root user. */ static void fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred, struct thread *td) { struct fuse_data *data; struct mount *mp; struct vattr va; int dataflags; mp = vnode_mount(vp); data = fuse_get_mpdata(mp); dataflags = data->dataflags; if (dataflags & FSESS_DEFAULT_PERMISSIONS) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { fuse_internal_getattr(vp, &va, cred, td); if (va.va_mode & (S_ISUID | S_ISGID)) { mode_t mode = va.va_mode & ~(S_ISUID | S_ISGID); /* Clear all vattr fields except mode */ vattr_null(&va); va.va_mode = mode; /* * Ignore fuse_internal_setattr's return value, * because at this point the write operation has * already succeeded and we don't want to return * failing status for that. 
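* Passing a NULL cred below makes fuse_internal_setattr() send the request with root credentials (uid and gid 0 in the FUSE header).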
*/ (void)fuse_internal_setattr(vp, &va, td, NULL); } } } } SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch, "struct vnode*", "struct uio*", "int", "struct ucred*", "struct fuse_filehandle*"); int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, bool pages, struct ucred *cred, pid_t pid) { struct fuse_filehandle *fufh; int err, directio; int fflag; bool closefufh = false; MPASS(vp->v_type == VREG || vp->v_type == VDIR); fflag = (uio->uio_rw == UIO_READ) ? FREAD : FWRITE; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { /* * nfsd will do I/O without first doing VOP_OPEN. We * must implicitly open the file here */ err = fuse_filehandle_open(vp, fflag, &fufh, curthread, cred); closefufh = true; } else if (err) { printf("FUSE: io dispatch: filehandles are closed\n"); return err; } if (err) goto out; SDT_PROBE5(fusefs, , io, io_dispatch, vp, uio, ioflag, cred, fufh); /* * Ideally, when the daemon asks for direct io at open time, the * standard file flag should be set according to this, so that would * just change the default mode, which later on could be changed via * fcntl(2). * But this doesn't work, the O_DIRECT flag gets cleared at some point * (don't know where). So to make any use of the Fuse direct_io option, * we hardwire it into the file's private data (similarly to Linux, * btw.). */ directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); switch (uio->uio_rw) { case UIO_READ: if (directio) { SDT_PROBE2(fusefs, , io, trace, 1, "direct read of vnode"); err = fuse_read_directbackend(vp, uio, cred, fufh); } else { SDT_PROBE2(fusefs, , io, trace, 1, "buffered read of vnode"); err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh, pid); } break; case UIO_WRITE: if (directio) { const int iosize = fuse_iosize(vp); off_t start, end, filesize; SDT_PROBE2(fusefs, , io, trace, 1, "direct write of vnode"); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) goto out; start = uio->uio_offset; end = start + uio->uio_resid; /* * Invalidate the write cache unless we're coming from * VOP_PUTPAGES, in which case we're writing _from_ the * write cache */ if (!pages ) v_inval_buf_range(vp, start, end, iosize); err = fuse_write_directbackend(vp, uio, cred, fufh, filesize, ioflag, pages); } else { SDT_PROBE2(fusefs, , io, trace, 1, "buffered write of vnode"); if (fuse_data_cache_mode == FUSE_CACHE_WT) ioflag |= IO_SYNC; err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, pid); } fuse_io_clear_suid_on_write(vp, cred, uio->uio_td); break; default: panic("uninterpreted mode passed to fuse_io_dispatch"); } out: if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (err); } SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int"); SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "struct buf*"); SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int", "struct buf*"); static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid) { struct buf *bp; - daddr_t lbn; - int bcount; - int err, n = 0, on = 0; + struct mount *mp; + struct fuse_data *data; + daddr_t lbn, nextlbn; + int bcount, nextsize; + int err, n = 0, on = 0, seqcount; off_t filesize; const int biosize = fuse_iosize(vp); + mp = vnode_mount(vp); + data = fuse_get_mpdata(mp); if (uio->uio_offset < 0) return (EINVAL); + seqcount = ioflag >> IO_SEQSHIFT; + err = fuse_vnode_size(vp, 
&filesize, cred, curthread); if (err) return err; for (err = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } if (filesize - uio->uio_offset <= 0) break; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); if ((off_t)lbn * biosize >= filesize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { bcount = filesize - (off_t)lbn *biosize; } else { bcount = biosize; } + nextlbn = lbn + 1; + nextsize = MIN(biosize, filesize - nextlbn * biosize); SDT_PROBE4(fusefs, , io, read_bio_backend_start, biosize, (int)lbn, on, bcount); - /* TODO: readahead. See ext2_read for an example */ - err = bread(vp, lbn, bcount, NOCRED, &bp); + if (bcount < biosize) { + /* If near EOF, don't do readahead */ + err = bread(vp, lbn, bcount, NOCRED, &bp); + /* TODO: clustered read */ + } else if (seqcount > 1 && data->max_readahead >= nextsize) { + /* Try non-clustered readahead */ + err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1, + NOCRED, &bp); + } else { + /* Just read what was requested */ + err = bread(vp, lbn, bcount, NOCRED, &bp); + } + if (err) { brelse(bp); bp = NULL; break; } /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount) n = MIN((unsigned)(bcount - on), uio->uio_resid); if (n > 0) { SDT_PROBE2(fusefs, , io, read_bio_backend_feed, n, bp); err = uiomove(bp->b_data + on, n, uio); } vfs_bio_brelse(bp, ioflag); SDT_PROBE4(fusefs, , io, read_bio_backend_end, err, uio->uio_resid, n, bp); } return (err); } SDT_PROBE_DEFINE1(fusefs, , io, read_directbackend_start, "struct fuse_read_in*"); SDT_PROBE_DEFINE3(fusefs, , io, read_directbackend_complete, "struct fuse_dispatcher*", "struct fuse_read_in*", "struct uio*"); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { struct fuse_data *data; struct fuse_dispatcher fdi; struct fuse_read_in *fri; int err = 0; data = fuse_get_mpdata(vp->v_mount); if (uio->uio_resid == 0) return (0); fdisp_init(&fdi, 0); /* * XXX In "normal" case we use an intermediate kernel buffer for * transmitting data from daemon's context to ours. Eventually, we should * get rid of this. Anyway, if the target uio lives in sysspace (we are * called from pageops), and the input data doesn't need kernel-side * processing (we are not called from readdir) we can already invoke * an optimized, "peer-to-peer" I/O routine. 
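* Until then, the loop below slices the request into FUSE_READ messages of at most max_read bytes each, copies each answer into the caller's uio, and stops early when the daemon returns a short read.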
*/ while (uio->uio_resid > 0) { fdi.iosize = sizeof(*fri); fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); fri = fdi.indata; fri->fh = fufh->fh_id; fri->offset = uio->uio_offset; fri->size = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_read); if (fuse_libabi_geq(data, 7, 9)) { /* See comment regarding FUSE_WRITE_LOCKOWNER */ fri->read_flags = 0; fri->flags = fufh_type_2_fflags(fufh->fufh_type); } SDT_PROBE1(fusefs, , io, read_directbackend_start, fri); if ((err = fdisp_wait_answ(&fdi))) goto out; SDT_PROBE3(fusefs, , io, read_directbackend_complete, &fdi, fri, uio); if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) break; if (fdi.iosize < fri->size) break; } out: fdisp_destroy(&fdi); return (err); } static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, int ioflag, bool pages) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_data *data; struct fuse_write_in *fwi; struct fuse_write_out *fwo; struct fuse_dispatcher fdi; size_t chunksize; void *fwi_data; off_t as_written_offset; int diff; int err = 0; bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO; uint32_t write_flags; data = fuse_get_mpdata(vp->v_mount); /* * Don't set FUSE_WRITE_LOCKOWNER in write_flags. It can't be set * accurately when using POSIX AIO, libfuse doesn't use it, and I'm not * aware of any file systems that do. It was an attempt to add * Linux-style mandatory locking to the FUSE protocol, but mandatory * locking is deprecated even on Linux. See Linux commit * f33321141b273d60cbb3a8f56a5489baad82ba5e . */ /* * Set FUSE_WRITE_CACHE whenever we don't know the uid, gid, and/or pid * that originated a write. For example when writing from the * writeback cache. I don't know of a single file system that cares, * but the protocol says we're supposed to do this. */ write_flags = !pages && ( (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)) || fuse_data_cache_mode != FUSE_CACHE_WB) ? 0 : FUSE_WRITE_CACHE; if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio_setoffset(uio, filesize); if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); fdisp_init(&fdi, 0); while (uio->uio_resid > 0) { chunksize = MIN(uio->uio_resid, data->max_write); fdi.iosize = sizeof(*fwi) + chunksize; fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); fwi = fdi.indata; fwi->fh = fufh->fh_id; fwi->offset = uio->uio_offset; fwi->size = chunksize; fwi->write_flags = write_flags; if (fuse_libabi_geq(data, 7, 9)) { fwi->flags = fufh_type_2_fflags(fufh->fufh_type); fwi_data = (char *)fdi.indata + sizeof(*fwi); } else { fwi_data = (char *)fdi.indata + FUSE_COMPAT_WRITE_IN_SIZE; } if ((err = uiomove(fwi_data, chunksize, uio))) break; retry: err = fdisp_wait_answ(&fdi); if (err == ERESTART || err == EINTR || err == EWOULDBLOCK) { /* * Rewind the uio so dofilewrite will know it's * incomplete */ uio->uio_resid += fwi->size; uio->uio_offset -= fwi->size; /* * Change ERESTART into EINTR because we can't rewind * uio->uio_iov. Basically, once uiomove(9) has been * called, it's impossible to restart a syscall. 
*/ if (err == ERESTART) err = EINTR; break; } else if (err) { break; } fwo = ((struct fuse_write_out *)fdi.answ); /* Adjust the uio in the case of short writes */ diff = fwi->size - fwo->size; as_written_offset = uio->uio_offset - diff; if (as_written_offset - diff > filesize && fuse_data_cache_mode != FUSE_CACHE_UC) fuse_vnode_setsize(vp, as_written_offset); if (as_written_offset - diff >= filesize) fvdat->flag &= ~FN_SIZECHANGE; if (diff < 0) { printf("WARNING: misbehaving FUSE filesystem " "wrote more data than we provided it\n"); err = EINVAL; break; } else if (diff > 0) { /* Short write */ if (!direct_io) { printf("WARNING: misbehaving FUSE filesystem: " "short writes are only allowed with " "direct_io\n"); } if (ioflag & IO_DIRECT) { /* Return early */ uio->uio_resid += diff; uio->uio_offset -= diff; break; } else { /* Resend the unwritten portion of data */ fdi.iosize = sizeof(*fwi) + diff; /* Refresh fdi without clearing data buffer */ fdisp_refresh_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); fwi = fdi.indata; MPASS2(fwi == fdi.indata, "FUSE dispatcher " "reallocated despite no increase in " "size?"); void *src = (char*)fwi_data + fwo->size; memmove(fwi_data, src, diff); fwi->fh = fufh->fh_id; fwi->offset = as_written_offset; fwi->size = diff; fwi->write_flags = write_flags; goto retry; } } } fdisp_destroy(&fdi); return (err); } SDT_PROBE_DEFINE6(fusefs, , io, write_biobackend_start, "int64_t", "int", "int", "struct uio*", "int", "bool"); SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_append_race, "long", "int"); SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_issue, "int", "struct buf*"); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct buf *bp; daddr_t lbn; off_t filesize; int bcount; int n, on, seqcount, err = 0; bool last_page; const int biosize = fuse_iosize(vp); seqcount = ioflag >> IO_SEQSHIFT; KASSERT(uio->uio_rw == UIO_WRITE, ("fuse_write_biobackend mode")); if (vp->v_type != VREG) return (EIO); if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) return err; if (ioflag & IO_APPEND) uio_setoffset(uio, filesize); if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); do { bool direct_append, extending; if (fuse_isdeadfs(vp)) { err = ENXIO; break; } lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); n = MIN((unsigned)(biosize - on), uio->uio_resid); again: /* Get or create a buffer for the write */ direct_append = uio->uio_offset == filesize && n; if (uio->uio_offset + n < filesize) { extending = false; if ((off_t)(lbn + 1) * biosize < filesize) { /* Not the file's last block */ bcount = biosize; } else { /* The file's last block */ bcount = filesize - (off_t)lbn * biosize; } } else { extending = true; bcount = on + n; } if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >= howmany(filesize, PAGE_SIZE)) last_page = true; else last_page = false; if (direct_append) { /* * Take care to preserve the buffer's B_CACHE state so * as not to cause an unnecessary read. 
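* That is done by getting the block at its existing size, saving B_CACHE, growing the buffer with allocbuf(), and then restoring the saved flag.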
*/ bp = getblk(vp, lbn, on, PCATCH, 0, 0); if (bp != NULL) { uint32_t save = bp->b_flags & B_CACHE; allocbuf(bp, bcount); bp->b_flags |= save; } } else { bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); } if (!bp) { err = EINTR; break; } if (extending) { /* * Extend file _after_ locking buffer so we won't race * with other readers */ err = fuse_vnode_setsize(vp, uio->uio_offset + n); filesize = uio->uio_offset + n; fvdat->flag |= FN_SIZECHANGE; if (err) { brelse(bp); break; } } SDT_PROBE6(fusefs, , io, write_biobackend_start, lbn, on, n, uio, bcount, direct_append); /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ if (on == 0 && n == bcount) { bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); fuse_io_strategy(vp, bp); if ((err = bp->b_error)) { brelse(bp); break; } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { SDT_PROBE2(fusefs, , io, write_biobackend_append_race, (long)bp->b_blkno * biosize, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. * * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * * as an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { /* * Yes, we mean it. Write out everything to "storage" * immediately, without hesitation. (Apart from other * reasons: the only way to know if a write is valid * if its actually written out.) */ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 0, bp); bwrite(bp); if (bp->b_error == EINTR) { err = EINTR; break; } goto again; } err = uiomove((char *)bp->b_data + on, n, uio); if (err) { bp->b_ioflags |= BIO_ERROR; bp->b_error = err; brelse(bp); break; /* TODO: vfs_bio_clrbuf like ffs_write does? */ } /* * Only update dirtyoff/dirtyend if not a degenerate * condition. 
*/ if (n) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_valid(bp, on, n); } vfs_bio_set_flags(bp, ioflag); if (last_page) { /* * When writing the last page of a file we must write * synchronously. If we didn't, then a subsequent * operation could extend the file, making the last * page of this buffer invalid because it would only be * partially cached. * * As an optimization, it would be allowable to only * write the last page synchronously. Or, it should be * possible to synchronously flush the last * already-written page whenever extending a file with * ftruncate or another write. */ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 1, bp); err = bwrite(bp); } else if (ioflag & IO_SYNC) { SDT_PROBE2(fusefs, , io, write_biobackend_issue, 2, bp); err = bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; SDT_PROBE2(fusefs, , io, write_biobackend_issue, 3, bp); bawrite(bp); } else if (on == 0 && n == bcount) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; SDT_PROBE2(fusefs, , io, write_biobackend_issue, 4, bp); cluster_write(vp, bp, filesize, seqcount, 0); } else { SDT_PROBE2(fusefs, , io, write_biobackend_issue, 5, bp); bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; SDT_PROBE2(fusefs, , io, write_biobackend_issue, 6, bp); bawrite(bp); } else { bp->b_flags &= ~B_CLUSTEROK; SDT_PROBE2(fusefs, , io, write_biobackend_issue, 7, bp); bdwrite(bp); } if (err) break; } while (uio->uio_resid > 0 && n > 0); return (err); } int fuse_io_strategy(struct vnode *vp, struct buf *bp) { struct fuse_filehandle *fufh; struct ucred *cred; struct uio *uiop; struct uio uio; struct iovec io; off_t filesize; int error = 0; int fflag; /* We don't know the true pid when we're dealing with the cache */ pid_t pid = 0; const int biosize = fuse_iosize(vp); MPASS(vp->v_type == VREG || vp->v_type == VDIR); MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE; cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (bp->b_iocmd == BIO_READ && error == EBADF) { /* * This may be a read-modify-write operation on a cached file * opened O_WRONLY. The FUSE protocol allows this. */ error = fuse_filehandle_get(vp, FWRITE, &fufh, cred, pid); } if (error) { printf("FUSE: strategy: filehandles are closed\n"); bp->b_ioflags |= BIO_ERROR; bp->b_error = error; bufdone(bp); return (error); } uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = curthread; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. */ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("fuse_io_strategy: bp %p already marked done", bp)); if (bp->b_iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize; error = fuse_read_directbackend(vp, uiop, cred, fufh); if (!error && uiop->uio_resid) { /* * If we had a short read with no error, we must have * hit a file hole. We should zero-fill the remainder. * This can also occur if the server hits the file EOF. 
* * Holes used to be able to occur due to pending * writes, but that is not possible any longer. */ int nread = bp->b_bcount - uiop->uio_resid; int left = uiop->uio_resid; if (left > 0) bzero((char *)bp->b_data + nread, left); uiop->uio_resid = 0; } if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * Setup for actual write */ error = fuse_vnode_size(vp, &filesize, cred, curthread); if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; bufdone(bp); return (error); } if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize) bp->b_dirtyend = filesize - (off_t)bp->b_lblkno * biosize; if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_lblkno * biosize + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; error = fuse_write_directbackend(vp, uiop, cred, fufh, filesize, 0, false); if (error == EINTR || error == ETIMEDOUT) { bp->b_flags &= ~(B_INVAL | B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((error == EINTR || error == ETIMEDOUT) && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_flags |= B_INVAL; bp->b_error = error; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; bufdone(bp); return (error); } int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td) { return (vn_fsync_buf(vp, waitfor)); } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int fuse_io_invalbuf(struct vnode *vp, struct thread *td) { struct fuse_vnode_data *fvdat = VTOFUD(vp); int error = 0; if (vp->v_iflag & VI_DOOMED) return 0; ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf"); while (fvdat->flag & FN_FLUSHINPROG) { struct proc *p = td->td_proc; if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) return EIO; fvdat->flag |= FN_FLUSHWANT; tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); error = 0; if (p != NULL) { PROC_LOCK(p); if (SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist)) error = EINTR; PROC_UNLOCK(p); } if (error == EINTR) return EINTR; } fvdat->flag |= FN_FLUSHINPROG; if (vp->v_bufobj.bo_object != NULL) { VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); while (error) { if (error == ERESTART || error == EINTR) { fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return EINTR; } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); } fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return (error); } diff --git a/sys/fs/fuse/fuse_ipc.h b/sys/fs/fuse/fuse_ipc.h index 89a17d7f7e23..440a4f671490 100644 --- a/sys/fs/fuse/fuse_ipc.h +++ b/sys/fs/fuse/fuse_ipc.h @@ -1,424 +1,425 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _FUSE_IPC_H_ #define _FUSE_IPC_H_ #include #include struct fuse_iov { void *base; size_t len; size_t allocated_size; int credit; }; void fiov_init(struct fuse_iov *fiov, size_t size); void fiov_teardown(struct fuse_iov *fiov); void fiov_refresh(struct fuse_iov *fiov); void fiov_adjust(struct fuse_iov *fiov, size_t size); #define FUSE_DIMALLOC(fiov, spc1, spc2, amnt) do { \ fiov_adjust(fiov, (sizeof(*(spc1)) + (amnt))); \ (spc1) = (fiov)->base; \ (spc2) = (char *)(fiov)->base + (sizeof(*(spc1))); \ } while (0) #define FU_AT_LEAST(siz) max((siz), 160) #define FUSE_ASSERT_AW_DONE(ftick) \ KASSERT((ftick)->tk_aw_link.tqe_next == NULL && \ (ftick)->tk_aw_link.tqe_prev == NULL, \ ("FUSE: ticket still on answer delivery list %p", (ftick))) #define FUSE_ASSERT_MS_DONE(ftick) \ KASSERT((ftick)->tk_ms_link.stqe_next == NULL, \ ("FUSE: ticket still on message list %p", (ftick))) struct fuse_ticket; struct fuse_data; typedef int fuse_handler_t(struct fuse_ticket *ftick, struct uio *uio); struct fuse_ticket { /* fields giving the identity of the ticket */ uint64_t tk_unique; struct fuse_data *tk_data; int tk_flag; u_int tk_refcount; /* * If this ticket's operation has been interrupted, this will hold the * unique value of the FUSE_INTERRUPT operation. Otherwise, it will be * 0. */ uint64_t irq_unique; /* fields for initiating an upgoing message */ struct fuse_iov tk_ms_fiov; void *tk_ms_bufdata; size_t tk_ms_bufsize; enum { FT_M_FIOV, FT_M_BUF } tk_ms_type; STAILQ_ENTRY(fuse_ticket) tk_ms_link; /* fields for handling answers coming from userspace */ struct fuse_iov tk_aw_fiov; void *tk_aw_bufdata; size_t tk_aw_bufsize; enum { FT_A_FIOV, FT_A_BUF } tk_aw_type; struct fuse_out_header tk_aw_ohead; int tk_aw_errno; struct mtx tk_aw_mtx; fuse_handler_t *tk_aw_handler; TAILQ_ENTRY(fuse_ticket) tk_aw_link; }; #define FT_ANSW 0x01 /* request of ticket has already been answered */ #define FT_DIRTY 0x04 /* ticket has been used */ static inline struct fuse_iov * fticket_resp(struct fuse_ticket *ftick) { return (&ftick->tk_aw_fiov); } static inline bool fticket_answered(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_aw_mtx, MA_OWNED); return (ftick->tk_flag & FT_ANSW); } static inline void fticket_set_answered(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_aw_mtx, MA_OWNED); ftick->tk_flag |= FT_ANSW; } static inline struct fuse_in_header* fticket_in_header(struct fuse_ticket *ftick) { return (struct fuse_in_header *)(ftick->tk_ms_fiov.base); } static inline enum fuse_opcode fticket_opcode(struct fuse_ticket *ftick) { return fticket_in_header(ftick)->opcode; } int fticket_pull(struct fuse_ticket *ftick, struct uio *uio); /* * The data representing a FUSE session. */ struct fuse_data { struct cdev *fdev; struct mount *mp; struct vnode *vroot; struct ucred *daemoncred; int dataflags; int ref; struct mtx ms_mtx; STAILQ_HEAD(, fuse_ticket) ms_head; int ms_count; struct mtx aw_mtx; TAILQ_HEAD(, fuse_ticket) aw_head; /* * Holds the next value of the FUSE operation unique value. * Also, serves as a wakeup channel to prevent any operations from * being created before INIT completes. 
*/ u_long ticketer; struct sx rename_lock; uint32_t fuse_libabi_major; uint32_t fuse_libabi_minor; + uint32_t max_readahead; uint32_t max_write; uint32_t max_read; uint32_t subtype; char volname[MAXPATHLEN]; struct selinfo ks_rsel; int daemon_timeout; uint64_t notimpl; uint64_t mnt_flag; }; #define FSESS_DEAD 0x0001 /* session is to be closed */ #define FSESS_UNUSED0 0x0002 /* unused */ #define FSESS_INITED 0x0004 /* session has been inited */ #define FSESS_DAEMON_CAN_SPY 0x0010 /* let non-owners access this fs */ /* (and being observed by the daemon) */ #define FSESS_PUSH_SYMLINKS_IN 0x0020 /* prefix absolute symlinks with mp */ #define FSESS_DEFAULT_PERMISSIONS 0x0040 /* kernel does permission checking */ #define FSESS_NO_ATTRCACHE 0x0080 /* no attribute caching */ #define FSESS_NO_READAHEAD 0x0100 /* no readaheads */ #define FSESS_NO_DATACACHE 0x0200 /* disable buffer cache */ #define FSESS_NO_NAMECACHE 0x0400 /* disable name cache */ #define FSESS_NO_MMAP 0x0800 /* disable mmap */ #define FSESS_ASYNC_READ 0x1000 /* allow multiple reads of some file */ #define FSESS_POSIX_LOCKS 0x2000 /* daemon supports POSIX locks */ #define FSESS_EXPORT_SUPPORT 0x10000 /* daemon supports NFS-style lookups */ #define FSESS_MNTOPTS_MASK ( \ FSESS_DAEMON_CAN_SPY | FSESS_PUSH_SYMLINKS_IN | \ FSESS_DEFAULT_PERMISSIONS | FSESS_NO_ATTRCACHE | \ FSESS_NO_READAHEAD | FSESS_NO_DATACACHE | \ FSESS_NO_NAMECACHE | FSESS_NO_MMAP) enum fuse_data_cache_mode { FUSE_CACHE_UC, FUSE_CACHE_WT, FUSE_CACHE_WB, }; extern int fuse_data_cache_mode; static inline struct fuse_data * fuse_get_mpdata(struct mount *mp) { return mp->mnt_data; } static inline bool fsess_isimpl(struct mount *mp, int opcode) { struct fuse_data *data = fuse_get_mpdata(mp); return ((data->notimpl & (1ULL << opcode)) == 0); } static inline void fsess_set_notimpl(struct mount *mp, int opcode) { struct fuse_data *data = fuse_get_mpdata(mp); data->notimpl |= (1ULL << opcode); } static inline bool fsess_opt_datacache(struct mount *mp) { struct fuse_data *data = fuse_get_mpdata(mp); return (fuse_data_cache_mode != FUSE_CACHE_UC && (data->dataflags & FSESS_NO_DATACACHE) == 0); } static inline bool fsess_opt_mmap(struct mount *mp) { struct fuse_data *data = fuse_get_mpdata(mp); if (fuse_data_cache_mode == FUSE_CACHE_UC) return (false); return ((data->dataflags & (FSESS_NO_DATACACHE | FSESS_NO_MMAP)) == 0); } /* Insert a new upgoing message */ static inline void fuse_ms_push(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED); refcount_acquire(&ftick->tk_refcount); STAILQ_INSERT_TAIL(&ftick->tk_data->ms_head, ftick, tk_ms_link); ftick->tk_data->ms_count++; } /* Insert a new upgoing message to the front of the queue */ static inline void fuse_ms_push_head(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED); refcount_acquire(&ftick->tk_refcount); STAILQ_INSERT_HEAD(&ftick->tk_data->ms_head, ftick, tk_ms_link); ftick->tk_data->ms_count++; } static inline struct fuse_ticket * fuse_ms_pop(struct fuse_data *data) { struct fuse_ticket *ftick = NULL; mtx_assert(&data->ms_mtx, MA_OWNED); if ((ftick = STAILQ_FIRST(&data->ms_head))) { STAILQ_REMOVE_HEAD(&data->ms_head, tk_ms_link); data->ms_count--; #ifdef INVARIANTS MPASS(data->ms_count >= 0); ftick->tk_ms_link.stqe_next = NULL; #endif } return (ftick); } static inline void fuse_aw_push(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_data->aw_mtx, MA_OWNED); refcount_acquire(&ftick->tk_refcount); TAILQ_INSERT_TAIL(&ftick->tk_data->aw_head, ftick, tk_aw_link); } static inline 
void fuse_aw_remove(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_data->aw_mtx, MA_OWNED); TAILQ_REMOVE(&ftick->tk_data->aw_head, ftick, tk_aw_link); #ifdef INVARIANTS ftick->tk_aw_link.tqe_next = NULL; ftick->tk_aw_link.tqe_prev = NULL; #endif } static inline struct fuse_ticket * fuse_aw_pop(struct fuse_data *data) { struct fuse_ticket *ftick; mtx_assert(&data->aw_mtx, MA_OWNED); if ((ftick = TAILQ_FIRST(&data->aw_head)) != NULL) fuse_aw_remove(ftick); return (ftick); } struct fuse_ticket *fuse_ticket_fetch(struct fuse_data *data); int fuse_ticket_drop(struct fuse_ticket *ftick); void fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t *handler); void fuse_insert_message(struct fuse_ticket *ftick, bool irq); static inline bool fuse_libabi_geq(struct fuse_data *data, uint32_t abi_maj, uint32_t abi_min) { return (data->fuse_libabi_major > abi_maj || (data->fuse_libabi_major == abi_maj && data->fuse_libabi_minor >= abi_min)); } struct fuse_data *fdata_alloc(struct cdev *dev, struct ucred *cred); void fdata_trydestroy(struct fuse_data *data); void fdata_set_dead(struct fuse_data *data); static inline bool fdata_get_dead(struct fuse_data *data) { return (data->dataflags & FSESS_DEAD); } struct fuse_dispatcher { struct fuse_ticket *tick; struct fuse_in_header *finh; void *indata; size_t iosize; uint64_t nodeid; int answ_stat; void *answ; }; static inline void fdisp_init(struct fuse_dispatcher *fdisp, size_t iosize) { fdisp->iosize = iosize; fdisp->tick = NULL; } static inline void fdisp_destroy(struct fuse_dispatcher *fdisp) { fuse_ticket_drop(fdisp->tick); #ifdef INVARIANTS fdisp->tick = NULL; #endif } void fdisp_refresh(struct fuse_dispatcher *fdip); void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred); void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred); void fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred); int fdisp_wait_answ(struct fuse_dispatcher *fdip); static inline int fdisp_simple_putget_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred) { fdisp_make_vp(fdip, op, vp, td, cred); return (fdisp_wait_answ(fdip)); } #endif /* _FUSE_IPC_H_ */ diff --git a/tests/sys/fs/fusefs/read.cc b/tests/sys/fs/fusefs/read.cc index e986c1310144..18e7217a2faa 100644 --- a/tests/sys/fs/fusefs/read.cc +++ b/tests/sys/fs/fusefs/read.cc @@ -1,783 +1,786 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 The FreeBSD Foundation * * This software was developed by BFF Storage Systems, LLC under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ extern "C" { #include #include #include #include #include #include #include #include #include } #include "mockfs.hh" #include "utils.hh" using namespace testing; class Read: public FuseTest { public: void expect_lookup(const char *relpath, uint64_t ino, uint64_t size) { FuseTest::expect_lookup(relpath, ino, S_IFREG | 0644, size, 1); } }; class Read_7_8: public FuseTest { public: virtual void SetUp() { m_kernel_minor_version = 8; FuseTest::SetUp(); } void expect_lookup(const char *relpath, uint64_t ino, uint64_t size) { FuseTest::expect_lookup_7_8(relpath, ino, S_IFREG | 0644, size, 1); } }; class AioRead: public Read { public: virtual void SetUp() { const char *node = "vfs.aio.enable_unsafe"; int val = 0; size_t size = sizeof(val); FuseTest::SetUp(); ASSERT_EQ(0, sysctlbyname(node, &val, &size, NULL, 0)) << strerror(errno); if (!val) GTEST_SKIP() << "vfs.aio.enable_unsafe must be set for this test"; } }; class AsyncRead: public AioRead { virtual void SetUp() { m_init_flags = FUSE_ASYNC_READ; AioRead::SetUp(); } }; class ReadCacheable: public Read { public: virtual void SetUp() { const char *node = "vfs.fusefs.data_cache_mode"; int val = 0; size_t size = sizeof(val); FuseTest::SetUp(); ASSERT_EQ(0, sysctlbyname(node, &val, &size, NULL, 0)) << strerror(errno); if (val == 0) GTEST_SKIP() << "fusefs data caching must be enabled for this test"; } }; class ReadAhead: public ReadCacheable, public WithParamInterface { virtual void SetUp() { m_maxreadahead = GetParam(); - Read::SetUp(); + ReadCacheable::SetUp(); } }; /* AIO reads need to set the header's pid field correctly */ /* https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=236379 */ TEST_F(AioRead, aio_read) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); char buf[bufsize]; struct aiocb iocb, *piocb; expect_lookup(RELPATH, ino, bufsize); expect_open(ino, 0, 1); expect_read(ino, 0, bufsize, bufsize, CONTENTS); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); iocb.aio_nbytes = bufsize; iocb.aio_fildes = fd; iocb.aio_buf = buf; iocb.aio_offset = 0; iocb.aio_sigevent.sigev_notify = SIGEV_NONE; ASSERT_EQ(0, aio_read(&iocb)) << strerror(errno); ASSERT_EQ(bufsize, aio_waitcomplete(&piocb, NULL)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS, bufsize)); /* Deliberately leak fd. 
close(2) will be tested in release.cc */ } /* * Without the FUSE_ASYNC_READ mount option, fuse(4) should ensure that there * is at most one outstanding read operation per file handle */ TEST_F(AioRead, async_read_disabled) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; uint64_t ino = 42; int fd; ssize_t bufsize = 50; char buf0[bufsize], buf1[bufsize]; off_t off0 = 0; off_t off1 = 65536; struct aiocb iocb0, iocb1; volatile sig_atomic_t read_count = 0; expect_lookup(RELPATH, ino, 131072); expect_open(ino, 0, 1); EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in.header.opcode == FUSE_READ && in.header.nodeid == ino && in.body.read.fh == FH && in.body.read.offset == (uint64_t)off0); }, Eq(true)), _) ).WillRepeatedly(Invoke([&](auto in __unused, auto &out __unused) { read_count++; /* Filesystem is slow to respond */ })); EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in.header.opcode == FUSE_READ && in.header.nodeid == ino && in.body.read.fh == FH && in.body.read.offset == (uint64_t)off1); }, Eq(true)), _) ).WillRepeatedly(Invoke([&](auto in __unused, auto &out __unused) { read_count++; /* Filesystem is slow to respond */ })); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); /* * Submit two AIO read requests, and respond to neither. If the * filesystem ever gets the second read request, then we failed to * limit outstanding reads. */ iocb0.aio_nbytes = bufsize; iocb0.aio_fildes = fd; iocb0.aio_buf = buf0; iocb0.aio_offset = off0; iocb0.aio_sigevent.sigev_notify = SIGEV_NONE; ASSERT_EQ(0, aio_read(&iocb0)) << strerror(errno); iocb1.aio_nbytes = bufsize; iocb1.aio_fildes = fd; iocb1.aio_buf = buf1; iocb1.aio_offset = off1; iocb1.aio_sigevent.sigev_notify = SIGEV_NONE; ASSERT_EQ(0, aio_read(&iocb1)) << strerror(errno); /* * Sleep for awhile to make sure the kernel has had a chance to issue * the second read, even though the first has not yet returned */ nap(); EXPECT_EQ(read_count, 1); m_mock->kill_daemon(); /* Wait for AIO activity to complete, but ignore errors */ (void)aio_waitcomplete(NULL, NULL); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* * With the FUSE_ASYNC_READ mount option, fuse(4) may issue multiple * simultaneous read requests on the same file handle. */ TEST_F(AsyncRead, async_read) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; uint64_t ino = 42; int fd; ssize_t bufsize = 50; char buf0[bufsize], buf1[bufsize]; off_t off0 = 0; off_t off1 = 65536; struct aiocb iocb0, iocb1; sem_t sem; ASSERT_EQ(0, sem_init(&sem, 0, 0)) << strerror(errno); expect_lookup(RELPATH, ino, 131072); expect_open(ino, 0, 1); EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in.header.opcode == FUSE_READ && in.header.nodeid == ino && in.body.read.fh == FH && in.body.read.offset == (uint64_t)off0); }, Eq(true)), _) ).WillOnce(Invoke([&](auto in __unused, auto &out __unused) { sem_post(&sem); /* Filesystem is slow to respond */ })); EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in.header.opcode == FUSE_READ && in.header.nodeid == ino && in.body.read.fh == FH && in.body.read.offset == (uint64_t)off1); }, Eq(true)), _) ).WillOnce(Invoke([&](auto in __unused, auto &out __unused) { sem_post(&sem); /* Filesystem is slow to respond */ })); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); /* * Submit two AIO read requests, but respond to neither. Ensure that * we received both. 
*/ iocb0.aio_nbytes = bufsize; iocb0.aio_fildes = fd; iocb0.aio_buf = buf0; iocb0.aio_offset = off0; iocb0.aio_sigevent.sigev_notify = SIGEV_NONE; ASSERT_EQ(0, aio_read(&iocb0)) << strerror(errno); iocb1.aio_nbytes = bufsize; iocb1.aio_fildes = fd; iocb1.aio_buf = buf1; iocb1.aio_offset = off1; iocb1.aio_sigevent.sigev_notify = SIGEV_NONE; ASSERT_EQ(0, aio_read(&iocb1)) << strerror(errno); /* Wait until both reads have reached the daemon */ ASSERT_EQ(0, sem_wait(&sem)) << strerror(errno); ASSERT_EQ(0, sem_wait(&sem)) << strerror(errno); m_mock->kill_daemon(); /* Wait for AIO activity to complete, but ignore errors */ (void)aio_waitcomplete(NULL, NULL); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* 0-length reads shouldn't cause any confusion */ TEST_F(Read, direct_io_read_nothing) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; uint64_t ino = 42; int fd; uint64_t offset = 100; char buf[80]; expect_lookup(RELPATH, ino, offset + 1000); expect_open(ino, FOPEN_DIRECT_IO, 1); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_EQ(0, pread(fd, buf, 0, offset)) << strerror(errno); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* * With direct_io, reads should not fill the cache. They should go straight to * the daemon */ TEST_F(Read, direct_io_pread) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; uint64_t offset = 100; ssize_t bufsize = strlen(CONTENTS); char buf[bufsize]; expect_lookup(RELPATH, ino, offset + bufsize); expect_open(ino, FOPEN_DIRECT_IO, 1); expect_read(ino, offset, bufsize, bufsize, CONTENTS); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_EQ(bufsize, pread(fd, buf, bufsize, offset)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS, bufsize)); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* * With direct_io, filesystems are allowed to return less data than is * requested. fuse(4) should return a short read to userland. */ TEST_F(Read, direct_io_short_read) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefghijklmnop"; uint64_t ino = 42; int fd; uint64_t offset = 100; ssize_t bufsize = strlen(CONTENTS); ssize_t halfbufsize = bufsize / 2; char buf[bufsize]; expect_lookup(RELPATH, ino, offset + bufsize); expect_open(ino, FOPEN_DIRECT_IO, 1); expect_read(ino, offset, bufsize, halfbufsize, CONTENTS); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_EQ(halfbufsize, pread(fd, buf, bufsize, offset)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS, halfbufsize)); /* Deliberately leak fd. close(2) will be tested in release.cc */ } TEST_F(Read, eio) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); char buf[bufsize]; expect_lookup(RELPATH, ino, bufsize); expect_open(ino, 0, 1); EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in.header.opcode == FUSE_READ); }, Eq(true)), _) ).WillOnce(Invoke(ReturnErrno(EIO))); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_EQ(-1, read(fd, buf, bufsize)) << strerror(errno); ASSERT_EQ(EIO, errno); /* Deliberately leak fd. 
close(2) will be tested in release.cc */ } /* * With the keep_cache option, the kernel may keep its read cache across * multiple open(2)s. */ TEST_F(ReadCacheable, keep_cache) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd0, fd1; ssize_t bufsize = strlen(CONTENTS); char buf[bufsize]; FuseTest::expect_lookup(RELPATH, ino, S_IFREG | 0644, bufsize, 2); expect_open(ino, FOPEN_KEEP_CACHE, 2); expect_read(ino, 0, bufsize, bufsize, CONTENTS); fd0 = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd0) << strerror(errno); ASSERT_EQ(bufsize, read(fd0, buf, bufsize)) << strerror(errno); fd1 = open(FULLPATH, O_RDWR); ASSERT_LE(0, fd1) << strerror(errno); /* * This read should be serviced by cache, even though it's on the other * file descriptor */ ASSERT_EQ(bufsize, read(fd1, buf, bufsize)) << strerror(errno); /* Deliberately leak fd0 and fd1. */ } /* * Without the keep_cache option, the kernel should drop its read caches on * every open */ TEST_F(Read, keep_cache_disabled) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd0, fd1; ssize_t bufsize = strlen(CONTENTS); char buf[bufsize]; FuseTest::expect_lookup(RELPATH, ino, S_IFREG | 0644, bufsize, 2); expect_open(ino, 0, 2); expect_read(ino, 0, bufsize, bufsize, CONTENTS); fd0 = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd0) << strerror(errno); ASSERT_EQ(bufsize, read(fd0, buf, bufsize)) << strerror(errno); fd1 = open(FULLPATH, O_RDWR); ASSERT_LE(0, fd1) << strerror(errno); /* * This read should not be serviced by cache, even though it's on the * original file descriptor */ expect_read(ino, 0, bufsize, bufsize, CONTENTS); ASSERT_EQ(0, lseek(fd0, 0, SEEK_SET)) << strerror(errno); ASSERT_EQ(bufsize, read(fd0, buf, bufsize)) << strerror(errno); /* Deliberately leak fd0 and fd1. */ } TEST_F(ReadCacheable, mmap) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t len; size_t bufsize = strlen(CONTENTS); void *p; len = getpagesize(); expect_lookup(RELPATH, ino, bufsize); expect_open(ino, 0, 1); /* mmap may legitimately try to read more data than is available */ EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in.header.opcode == FUSE_READ && in.header.nodeid == ino && in.body.read.fh == Read::FH && in.body.read.offset == 0 && in.body.read.size >= bufsize); }, Eq(true)), _) ).WillOnce(Invoke(ReturnImmediate([=](auto in __unused, auto& out) { out.header.len = sizeof(struct fuse_out_header) + bufsize; memmove(out.body.bytes, CONTENTS, bufsize); }))); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); ASSERT_NE(MAP_FAILED, p) << strerror(errno); ASSERT_EQ(0, memcmp(p, CONTENTS, bufsize)); ASSERT_EQ(0, munmap(p, len)) << strerror(errno); /* Deliberately leak fd. 
close(2) will be tested in release.cc */ } /* * Just as when FOPEN_DIRECT_IO is used, reads with O_DIRECT should bypass * cache and go straight to the daemon */ TEST_F(Read, o_direct) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); char buf[bufsize]; expect_lookup(RELPATH, ino, bufsize); expect_open(ino, 0, 1); expect_read(ino, 0, bufsize, bufsize, CONTENTS); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); // Fill the cache ASSERT_EQ(bufsize, read(fd, buf, bufsize)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS, bufsize)); // Reads with o_direct should bypass the cache expect_read(ino, 0, bufsize, bufsize, CONTENTS); ASSERT_EQ(0, fcntl(fd, F_SETFL, O_DIRECT)) << strerror(errno); ASSERT_EQ(0, lseek(fd, 0, SEEK_SET)) << strerror(errno); ASSERT_EQ(bufsize, read(fd, buf, bufsize)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS, bufsize)); /* Deliberately leak fd. close(2) will be tested in release.cc */ } TEST_F(Read, pread) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; /* * Set offset to a maxbcachebuf boundary so we'll be sure what offset * to read from. Without this, the read might start at a lower offset. */ uint64_t offset = m_maxbcachebuf; ssize_t bufsize = strlen(CONTENTS); char buf[bufsize]; expect_lookup(RELPATH, ino, offset + bufsize); expect_open(ino, 0, 1); expect_read(ino, offset, bufsize, bufsize, CONTENTS); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_EQ(bufsize, pread(fd, buf, bufsize, offset)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS, bufsize)); /* Deliberately leak fd. close(2) will be tested in release.cc */ } TEST_F(Read, read) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); char buf[bufsize]; expect_lookup(RELPATH, ino, bufsize); expect_open(ino, 0, 1); expect_read(ino, 0, bufsize, bufsize, CONTENTS); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_EQ(bufsize, read(fd, buf, bufsize)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS, bufsize)); /* Deliberately leak fd. close(2) will be tested in release.cc */ } TEST_F(Read_7_8, read) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); char buf[bufsize]; expect_lookup(RELPATH, ino, bufsize); expect_open(ino, 0, 1); expect_read(ino, 0, bufsize, bufsize, CONTENTS); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_EQ(bufsize, read(fd, buf, bufsize)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS, bufsize)); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* * If caching is enabled, the kernel should try to read an entire cache block * at a time.
*/ TEST_F(ReadCacheable, cache_block) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS0 = "abcdefghijklmnop"; uint64_t ino = 42; int fd; ssize_t bufsize = 8; ssize_t filesize = m_maxbcachebuf * 2; char *contents; char buf[bufsize]; const char *contents1 = CONTENTS0 + bufsize; contents = (char*)calloc(1, filesize); ASSERT_NE(NULL, contents); memmove(contents, CONTENTS0, strlen(CONTENTS0)); expect_lookup(RELPATH, ino, filesize); expect_open(ino, 0, 1); expect_read(ino, 0, m_maxbcachebuf, m_maxbcachebuf, contents); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_EQ(bufsize, read(fd, buf, bufsize)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS0, bufsize)); /* A subsequent read should be serviced by cache */ ASSERT_EQ(bufsize, read(fd, buf, bufsize)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, contents1, bufsize)); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* Reading with sendfile should work (though it obviously won't be 0-copy) */ TEST_F(ReadCacheable, sendfile) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; size_t bufsize = strlen(CONTENTS); char buf[bufsize]; int sp[2]; off_t sbytes; expect_lookup(RELPATH, ino, bufsize); expect_open(ino, 0, 1); /* Like mmap, sendfile may request more data than is available */ EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in.header.opcode == FUSE_READ && in.header.nodeid == ino && in.body.read.fh == Read::FH && in.body.read.offset == 0 && in.body.read.size >= bufsize); }, Eq(true)), _) ).WillOnce(Invoke(ReturnImmediate([=](auto in __unused, auto& out) { out.header.len = sizeof(struct fuse_out_header) + bufsize; memmove(out.body.bytes, CONTENTS, bufsize); }))); ASSERT_EQ(0, socketpair(PF_LOCAL, SOCK_STREAM, 0, sp)) << strerror(errno); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_EQ(0, sendfile(fd, sp[1], 0, bufsize, NULL, &sbytes, 0)) << strerror(errno); ASSERT_EQ(static_cast(bufsize), read(sp[0], buf, bufsize)) << strerror(errno); ASSERT_EQ(0, memcmp(buf, CONTENTS, bufsize)); close(sp[1]); close(sp[0]); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* sendfile should fail gracefully if fuse declines the read */ /* https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=236466 */ TEST_F(ReadCacheable, DISABLED_sendfile_eio) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); int sp[2]; off_t sbytes; expect_lookup(RELPATH, ino, bufsize); expect_open(ino, 0, 1); EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in.header.opcode == FUSE_READ); }, Eq(true)), _) ).WillOnce(Invoke(ReturnErrno(EIO))); ASSERT_EQ(0, socketpair(PF_LOCAL, SOCK_STREAM, 0, sp)) << strerror(errno); fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); ASSERT_NE(0, sendfile(fd, sp[1], 0, bufsize, NULL, &sbytes, 0)); close(sp[1]); close(sp[0]); /* Deliberately leak fd. 
close(2) will be tested in release.cc */ } /* fuse(4) should honor the filesystem's requested m_readahead parameter */ -/* https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=236472 */ -TEST_P(ReadAhead, DISABLED_readahead) { +TEST_P(ReadAhead, readahead) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; - const char *CONTENTS0 = "abcdefghijklmnop"; uint64_t ino = 42; - int fd; - ssize_t bufsize = 8; - ssize_t filesize = m_maxbcachebuf * 2; - char *contents; - char buf[bufsize]; - - ASSERT_TRUE(GetParam() < (uint32_t)m_maxbcachebuf) - << "Test assumes that max_readahead < maxbcachebuf"; + int fd, i; + ssize_t bufsize = m_maxbcachebuf; + ssize_t filesize = m_maxbcachebuf * 4; + char *rbuf, *contents; - contents = (char*)calloc(1, filesize); + contents = (char*)malloc(filesize); ASSERT_NE(NULL, contents); - memmove(contents, CONTENTS0, strlen(CONTENTS0)); + memset(contents, 'X', filesize); + rbuf = (char*)calloc(1, bufsize); expect_lookup(RELPATH, ino, filesize); expect_open(ino, 0, 1); /* fuse(4) should only read ahead the allowed amount */ - expect_read(ino, 0, GetParam(), GetParam(), contents); + expect_read(ino, 0, m_maxbcachebuf, m_maxbcachebuf, contents); + for (i = 0; i < (int)GetParam() / m_maxbcachebuf; i++) { + off_t offs = (i + 1) * m_maxbcachebuf; + expect_read(ino, offs, m_maxbcachebuf, m_maxbcachebuf, + contents + offs); + } fd = open(FULLPATH, O_RDONLY); ASSERT_LE(0, fd) << strerror(errno); - ASSERT_EQ(bufsize, read(fd, buf, bufsize)) << strerror(errno); - ASSERT_EQ(0, memcmp(buf, CONTENTS0, bufsize)); + /* Set the internal readahead counter to a "large" value */ + ASSERT_EQ(0, fcntl(fd, F_READAHEAD, 1'000'000'000)) << strerror(errno); + + ASSERT_EQ(bufsize, read(fd, rbuf, bufsize)) << strerror(errno); + ASSERT_EQ(0, memcmp(rbuf, contents, bufsize)); /* Deliberately leak fd. close(2) will be tested in release.cc */ } -INSTANTIATE_TEST_CASE_P(RA, ReadAhead, ::testing::Values(0u, 2048u)); +INSTANTIATE_TEST_CASE_P(RA, ReadAhead, ::testing::Values(0u, 65536)); diff --git a/tests/sys/fs/fusefs/utils.hh b/tests/sys/fs/fusefs/utils.hh index 82b32bceafeb..968683617062 100644 --- a/tests/sys/fs/fusefs/utils.hh +++ b/tests/sys/fs/fusefs/utils.hh @@ -1,212 +1,208 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 The FreeBSD Foundation * * This software was developed by BFF Storage Systems, LLC under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ struct _sem; typedef struct _sem sem_t; /* Nanoseconds to sleep, for tests that must */ #define NAP_NS (100'000'000) void get_unprivileged_id(uid_t *uid, gid_t *gid); inline void nap() { usleep(NAP_NS / 1000); } extern const uint32_t libfuse_max_write; extern const uint32_t default_max_write; class FuseTest : public ::testing::Test { protected: uint32_t m_maxreadahead; uint32_t m_maxwrite; uint32_t m_init_flags; bool m_allow_other; bool m_default_permissions; uint32_t m_kernel_minor_version; enum poll_method m_pm; bool m_push_symlinks_in; bool m_ro; bool m_async; MockFS *m_mock = NULL; const static uint64_t FH = 0xdeadbeef1a7ebabe; public: int m_maxbcachebuf; FuseTest(): - /* - * libfuse's default max_readahead is UINT_MAX, though it can - * be lowered - */ - m_maxreadahead(UINT_MAX), + m_maxreadahead(0), m_maxwrite(default_max_write), m_init_flags(0), m_allow_other(false), m_default_permissions(false), m_kernel_minor_version(FUSE_KERNEL_MINOR_VERSION), m_pm(BLOCKING), m_push_symlinks_in(false), m_ro(false), m_async(false) {} virtual void SetUp(); virtual void TearDown() { if (m_mock) delete m_mock; } /* * Create an expectation that FUSE_ACCESS will be called once for the * given inode with the given access_mode, returning the given errno */ void expect_access(uint64_t ino, mode_t access_mode, int error); /* Expect FUSE_DESTROY and shutdown the daemon */ void expect_destroy(int error); /* * Create an expectation that FUSE_FLUSH will be called times times for * the given inode */ void expect_flush(uint64_t ino, int times, ProcessMockerT r); /* * Create an expectation that FUSE_FORGET will be called for the given * inode. There will be no response. If sem is provided, it will be * posted after the operation is received by the daemon. */ void expect_forget(uint64_t ino, uint64_t nlookup, sem_t *sem = NULL); /* * Create an expectation that FUSE_GETATTR will be called for the given * inode any number of times. It will respond with a few basic * attributes, like the given size and the mode S_IFREG | 0644 */ void expect_getattr(uint64_t ino, uint64_t size); /* * Create an expectation that FUSE_LOOKUP will be called for the given * path exactly times times and cache validity period. It will respond * with inode ino, mode mode, filesize size. */ void expect_lookup(const char *relpath, uint64_t ino, mode_t mode, uint64_t size, int times, uint64_t attr_valid = UINT64_MAX, uid_t uid = 0, gid_t gid = 0); /* The protocol 7.8 version of expect_lookup */ void expect_lookup_7_8(const char *relpath, uint64_t ino, mode_t mode, uint64_t size, int times, uint64_t attr_valid = UINT64_MAX, uid_t uid = 0, gid_t gid = 0); /* * Create an expectation that FUSE_OPEN will be called for the given * inode exactly times times. It will return with open_flags flags and * file handle FH. */ void expect_open(uint64_t ino, uint32_t flags, int times); /* * Create an expectation that FUSE_OPENDIR will be called exactly once * for inode ino. 
*/ void expect_opendir(uint64_t ino); /* * Create an expectation that FUSE_READ will be called exactly once for * the given inode, at offset offset and with size isize. It will * return the first osize bytes from contents * * Protocol 7.8 tests can use this same expectation method because * nothing currently validates the size of the fuse_read_in struct. */ void expect_read(uint64_t ino, uint64_t offset, uint64_t isize, uint64_t osize, const void *contents, int flags = -1); /* * Create an expectation that FUSE_READDIR will be called any number of * times on the given ino with the given offset, returning (by copy) * the provided entries */ void expect_readdir(uint64_t ino, uint64_t off, std::vector &ents); /* * Create an expectation that FUSE_RELEASE will be called exactly once * for the given inode and filehandle, returning success */ void expect_release(uint64_t ino, uint64_t fh); /* * Create an expectation that FUSE_RELEASEDIR will be called exactly * once for the given inode */ void expect_releasedir(uint64_t ino, ProcessMockerT r); /* * Create an expectation that FUSE_UNLINK will be called exactly once * for the given path, returning an errno */ void expect_unlink(uint64_t parent, const char *path, int error); /* * Create an expectation that FUSE_WRITE will be called exactly once * for the given inode, at offset offset, with size isize and buffer * contents. Any flags present in flags_set must be set, and any * present in flags_unset must not be set. Other flags are don't care. * It will return osize. */ void expect_write(uint64_t ino, uint64_t offset, uint64_t isize, uint64_t osize, uint32_t flags_set, uint32_t flags_unset, const void *contents); /* Protocol 7.8 version of expect_write */ void expect_write_7_8(uint64_t ino, uint64_t offset, uint64_t isize, uint64_t osize, const void *contents); /* * Helper that runs code in a child process. * * First, parent_func runs in the parent process. * Then, child_func runs in the child process, dropping privileges if * desired. * Finally, fusetest_fork returns. * * # Returns * * fusetest_fork may SKIP the test, which the caller should detect with * the IsSkipped() method. If not, then the child's exit status will * be returned in status. */ void fork(bool drop_privs, int *status, std::function parent_func, std::function child_func); };
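For context on how the new readahead limit is exercised from userland: on FreeBSD a process can request a per-descriptor readahead amount with fcntl(F_READAHEAD), which is what the un-disabled readahead test above does, yet fusefs is still expected to cap the readahead it actually issues to the daemon at the max_readahead value the filesystem advertises in its FUSE_INIT reply (the value the new fuse_data.max_readahead field is positioned to hold). The following is a minimal sketch of the userland side only, not part of the patch; the mount point and file name are placeholders.

#include <sys/types.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	/* Placeholder path; assumes a fusefs filesystem is mounted here. */
	const char *path = "/mnt/fuse/some_file.txt";
	char buf[65536];
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		err(1, "open");
	/*
	 * Request an enormous per-descriptor readahead.  The syscall
	 * succeeds, but fusefs should still bound the readahead it sends
	 * to the daemon by the negotiated max_readahead.
	 */
	if (fcntl(fd, F_READAHEAD, 1000000000) == -1)
		err(1, "fcntl");
	/* A sequential read may now trigger (bounded) readahead. */
	if (read(fd, buf, sizeof(buf)) == -1)
		err(1, "read");
	close(fd);
	return (0);
}

Presumably the per-mount cap comes from the daemon's fuse_init_out.max_readahead, which would be recorded once at INIT time and then consulted for every cached read; that is consistent with the new struct field and with the expectations the readahead test sets up, but the exact kernel-side plumbing is outside the hunks shown here.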