Index: sys/amd64/conf/GENERIC =================================================================== --- sys/amd64/conf/GENERIC +++ sys/amd64/conf/GENERIC @@ -99,6 +99,8 @@ options DEADLKRES # Enable the deadlock resolver options INVARIANTS # Enable calls of extra sanity checking options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +options DEBUG_VFS_LOCKS +options KTR options QUEUE_MACRO_DEBUG_TRASH # Trash queue(2) internal pointers on invalidation options WITNESS # Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed Index: sys/fs/fuse/fuse_file.c =================================================================== --- sys/fs/fuse/fuse_file.c +++ sys/fs/fuse/fuse_file.c @@ -171,10 +171,11 @@ { struct fuse_dispatcher fdi; struct fuse_release_in *fri; - int err = 0; int op = FUSE_RELEASE; + ASSERT_VOP_ELOCKED(vp, __func__); + if (fuse_isdeadfs(vp)) { goto out; } @@ -218,6 +219,8 @@ struct fuse_filehandle *fufh; fufh_type_t fufh_type = fflags_2_fufh_type(mode); + ASSERT_VOP_LOCKED(vp, __func__); + /* * Unlike fuse_filehandle_get, we want to search for a filehandle with * the exact cred, and no fallback @@ -253,6 +256,8 @@ struct fuse_filehandle *fufh; fufh_type_t fufh_type; + ASSERT_VOP_LOCKED(vp, __func__); + fufh_type = fflags_2_fufh_type(fflag); /* cred can be NULL for in-kernel clients */ if (cred == NULL) @@ -290,6 +295,8 @@ struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; + ASSERT_VOP_LOCKED(vp, __func__); + if (cred == NULL) goto fallback; @@ -333,6 +340,8 @@ struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; + ASSERT_VOP_ELOCKED(vp, __func__); + fufh = malloc(sizeof(struct fuse_filehandle), M_FUSE_FILEHANDLE, M_WAITOK); MPASS(fufh != NULL); @@ -352,7 +361,6 @@ counter_u64_add(fuse_fh_count, 1); if (foo->open_flags & FOPEN_DIRECT_IO) { - ASSERT_VOP_ELOCKED(vp, __func__); VTOFUD(vp)->flag |= FN_DIRECTIO; fuse_io_invalbuf(vp, td); } else { Index: sys/fs/fuse/fuse_internal.c =================================================================== --- sys/fs/fuse/fuse_internal.c +++ sys/fs/fuse/fuse_internal.c @@ -337,6 +337,8 @@ int op = FUSE_FSYNC; int err = 0; + ASSERT_VOP_LOCKED(vp, __func__); + if (fsess_not_impl(vnode_mount(vp), (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) { return 0; @@ -411,10 +413,10 @@ return (0); if (fnieo.parent == FUSE_ROOT_ID) - err = VFS_ROOT(mp, LK_SHARED, &dvp); + err = VFS_ROOT(mp, LK_EXCLUSIVE, &dvp); else err = fuse_internal_get_cached_vnode( mp, fnieo.parent, - LK_SHARED, &dvp); + LK_EXCLUSIVE, &dvp); SDT_PROBE3(fusefs, , internal, invalidate_entry, dvp, &fnieo, name); /* * If dvp is not in the cache, then it must've been reclaimed. And @@ -459,8 +461,8 @@ if (fniio.ino == FUSE_ROOT_ID) err = VFS_ROOT(mp, LK_EXCLUSIVE, &vp); else - err = fuse_internal_get_cached_vnode(mp, fniio.ino, LK_SHARED, - &vp); + err = fuse_internal_get_cached_vnode(mp, fniio.ino, + LK_EXCLUSIVE, &vp); SDT_PROBE2(fusefs, , internal, invalidate_inode, vp, &fniio); if (err != 0 || vp == NULL) return (err); @@ -681,6 +683,8 @@ nlink_t nlink; int err = 0; + ASSERT_VOP_ELOCKED(vp, __func__); + fdisp_init(&fdi, cnp->cn_namelen + 1); fdisp_make_vp(&fdi, op, dvp, cnp->cn_thread, cnp->cn_cred); @@ -887,6 +891,8 @@ enum vtype vtyp; int err; + ASSERT_VOP_LOCKED(vp, __func__); + fdisp_init(&fdi, sizeof(*fgai)); fdisp_make_vp(&fdi, FUSE_GETATTR, vp, td, cred); fgai = fdi.indata; @@ -932,8 +938,18 @@ int iosize = fuse_iosize(vp); v_inval_buf_range(vp, 0, INT64_MAX, iosize); } - fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, - fao->attr_valid_nsec, vap); + if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { + fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, + fao->attr_valid_nsec, vap); + } else if (vn_lock(vp, LK_TRYUPGRADE) == 0) { + fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, + fao->attr_valid_nsec, vap); + vn_lock(vp, LK_DOWNGRADE); + } else { + /* We can't update the cache. Too bad. */ + SDT_PROBE2(fusefs, , internal, trace, 1, + "Failed to upgrade lock"); + } if (vtyp != vnode_vtype(vp)) { fuse_internal_vnode_disappear(vp); err = ENOENT; @@ -1135,6 +1151,8 @@ int sizechanged = -1; uint64_t newsize = 0; + ASSERT_VOP_ELOCKED(vp, __func__); + mp = vnode_mount(vp); fvdat = VTOFUD(vp); data = fuse_get_mpdata(mp); Index: sys/fs/fuse/fuse_io.c =================================================================== --- sys/fs/fuse/fuse_io.c +++ sys/fs/fuse/fuse_io.c @@ -982,7 +982,16 @@ */ SDT_PROBE2(fusefs, , io, trace, 1, "Short read of a clean file"); - fuse_vnode_clear_attr_cache(vp); + if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { + fuse_vnode_clear_attr_cache(vp); + } else if (vn_lock(vp, LK_TRYUPGRADE) == 0) { + fuse_vnode_clear_attr_cache(vp); + vn_lock(vp, LK_DOWNGRADE); + } else { + /* We can't clear the attr cache. */ + SDT_PROBE2(fusefs, , io, trace, 1, + "Failed to upgrade lock"); + } } else { /* * If dirty writes _are_ cached beyond EOF, Index: sys/fs/fuse/fuse_node.h =================================================================== --- sys/fs/fuse/fuse_node.h +++ sys/fs/fuse/fuse_node.h @@ -101,20 +101,27 @@ uint64_t parent_nid; /** I/O **/ - /* List of file handles for all of the vnode's open file descriptors */ + /* + * List of file handles for all of the vnode's open file descriptors. + * Protected by the vnode lock. + */ LIST_HEAD(, fuse_filehandle) handles; /** flags **/ uint32_t flag; /** meta **/ - /* The monotonic time after which the attr cache is invalid */ + /* + * The monotonic time after which the attr cache is invalid, protected + * by the vnode lock. + */ struct bintime attr_cache_timeout; /* * Monotonic time after which the entry is invalid. Used for lookups * by nodeid instead of pathname. */ struct bintime entry_cache_timeout; + /* Cached file attributes, protected by the vnode lock */ struct vattr cached_attrs; uint64_t nlookup; enum vtype vtype; @@ -139,6 +146,8 @@ { struct bintime now; + ASSERT_VOP_LOCKED(vp, __func__); + getbinuptime(&now); return (bintime_cmp(&(VTOFUD(vp)->attr_cache_timeout), &now, >)); } @@ -146,6 +155,8 @@ static inline struct vattr* VTOVA(struct vnode *vp) { + ASSERT_VOP_LOCKED(vp, __func__); + if (fuse_vnode_attr_cache_valid(vp)) return &(VTOFUD(vp)->cached_attrs); else @@ -155,6 +166,8 @@ static inline void fuse_vnode_clear_attr_cache(struct vnode *vp) { + ASSERT_VOP_ELOCKED(vp, __func__); + bintime_clear(&VTOFUD(vp)->attr_cache_timeout); } Index: sys/fs/fuse/fuse_node.c =================================================================== --- sys/fs/fuse/fuse_node.c +++ sys/fs/fuse/fuse_node.c @@ -350,7 +350,7 @@ struct fuse_setattr_in *fsai; int err = 0; - ASSERT_VOP_ELOCKED(vp, "fuse_io_extend"); + ASSERT_VOP_ELOCKED(vp, __func__); if (fuse_isdeadfs(vp)) { return EBADF; @@ -449,6 +449,8 @@ struct fuse_vnode_data *fvdat = VTOFUD(vp); int error = 0; + ASSERT_VOP_LOCKED(vp, __func__); + if (!(fvdat->flag & FN_SIZECHANGE) && (!fuse_vnode_attr_cache_valid(vp) || fvdat->cached_attrs.va_size == VNOVAL)) @@ -476,6 +478,8 @@ struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); struct timespec ts; + ASSERT_VOP_ELOCKED(vp, __func__); + vfs_timestamp(&ts); if (data->time_gran > 1) Index: sys/fs/fuse/fuse_vfsops.c =================================================================== --- sys/fs/fuse/fuse_vfsops.c +++ sys/fs/fuse/fuse_vfsops.c @@ -576,6 +576,7 @@ error = fuse_vnode_get(mp, feo, nodeid, NULL, vpp, NULL, vtyp); if (error) goto out; + ASSERT_VOP_ELOCKED(*vpp, __func__); filesize = feo->attr.size; /* Index: sys/fs/fuse/fuse_vnops.c =================================================================== --- sys/fs/fuse/fuse_vnops.c +++ sys/fs/fuse/fuse_vnops.c @@ -1378,6 +1378,7 @@ * * following attribute cache expiration, or * * due a bug in the daemon, or */ + ASSERT_VOP_LOCKED(vp, __func__); fvdat = VTOFUD(vp); if (vnode_isreg(vp) && ((filesize != fvdat->cached_attrs.va_size && @@ -1825,6 +1826,8 @@ return err; } +SDT_PROBE_DEFINE4(fusefs, , vnops, erelookup, "struct vnode*", + "struct vnode*", "struct vnode*", "struct vnode*"); /* struct vnop_rename_args { struct vnode *a_fdvp; @@ -1871,10 +1874,26 @@ */ data = fuse_get_mpdata(vnode_mount(tdvp)); if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) { - err = fuse_internal_access(fvp, VWRITE, - tcnp->cn_thread, tcnp->cn_cred); - if (err) + /* + * Must use LK_NOWAIT to prevent LORs between fvp and tdvp or + * tvp + */ + if (vn_lock(fvp, LK_SHARED | LK_NOWAIT) == 0) { + err = fuse_internal_access(fvp, VWRITE, + tcnp->cn_thread, tcnp->cn_cred); + VOP_UNLOCK(fvp); + if (err) + goto out; + } else { + /* + * Can't release tdvp or tvp to try releasing the LOR. + * Must return instead. + */ + SDT_PROBE4(fusefs, , vnops, erelookup, fdvp, fvp, tdvp, + tvp); + err = ERELOOKUP; goto out; + } } sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); @@ -2179,6 +2198,8 @@ struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; + /* vp should be ELOCKED, because we don't set MNTK_SHARED_WRITES */ + ASSERT_VOP_ELOCKED(vp, __func__); if (fuse_isdeadfs(vp)) { return ENXIO; } Index: tests/sys/fs/fusefs/default_permissions.cc =================================================================== --- tests/sys/fs/fusefs/default_permissions.cc +++ tests/sys/fs/fusefs/default_permissions.cc @@ -1019,6 +1019,83 @@ ASSERT_EQ(EPERM, errno); } +static volatile int stopit = 0; + +static void* setattr_th(void* arg) { + char *path = (char*)arg; + + while (stopit == 0) + chmod(path, 0777); + return 0; +} + +/* + * Rename restarts the syscall to avoid a LOR + * + * This test triggers a race: the chmod() calls VOP_SETATTR, which locks its + * vnode, but fuse_vnop_rename also tries to lock the same vnode in order to + * use FUSE_GETATTR on it. The result is that, in order to avoid a LOR, + * fuse_vnop_rename returns ERELOOKUP to restart the syscall. + * + * To verify that the race is hit, watch the fusefs:fusefs:vnops:erelookup + * dtrace probe while the test is running. On my system, that probe fires 1 - 5 + * times per second during this test. + */ +TEST_F(Rename, erelookup) +{ + const char FULLDST[] = "mountpoint/dstdir/dst"; + const char RELDSTDIR[] = "dstdir"; + const char RELDST[] = "dst"; + const char FULLSRC[] = "mountpoint/src"; + const char RELSRC[] = "src"; + pthread_t th0; + uint64_t ino = 42; + uint64_t dst_dir_ino = 43; + struct timespec now, timeout; + + expect_getattr(FUSE_ROOT_ID, S_IFDIR | 0777, UINT64_MAX, 1, geteuid()); + EXPECT_LOOKUP(FUSE_ROOT_ID, RELSRC) + .WillRepeatedly(Invoke( + ReturnImmediate([=](auto i __unused, auto& out) { + SET_OUT_HEADER_LEN(out, entry); + out.body.entry.attr.mode = S_IFDIR | 0644; + out.body.entry.nodeid = ino; + out.body.entry.attr.nlink = 2; + out.body.entry.attr_valid = UINT64_MAX; + out.body.entry.attr.uid = 0; + out.body.entry.attr.gid = 0; + out.body.entry.entry_valid = UINT64_MAX; + })) + ); + EXPECT_LOOKUP(FUSE_ROOT_ID, RELDSTDIR) + .WillRepeatedly(Invoke(ReturnImmediate([=](auto in __unused, auto& out) + { + SET_OUT_HEADER_LEN(out, entry); + out.body.entry.nodeid = dst_dir_ino; + out.body.entry.entry_valid = UINT64_MAX; + out.body.entry.attr_valid = UINT64_MAX; + out.body.entry.attr.mode = S_IFDIR | 0755; + out.body.entry.attr.ino = dst_dir_ino; + out.body.entry.attr.uid = geteuid(); + out.body.entry.attr.gid = getegid(); + }))); + EXPECT_LOOKUP(dst_dir_ino, RELDST) + .WillRepeatedly(Invoke(ReturnErrno(ENOENT))); + + ASSERT_EQ(0, pthread_create(&th0, NULL, setattr_th, (void*)FULLSRC)) + << strerror(errno); + + ASSERT_EQ(0, clock_gettime(CLOCK_MONOTONIC, &timeout)); + timeout.tv_sec += 1; + do { + ASSERT_EQ(-1, rename(FULLSRC, FULLDST)); + EXPECT_EQ(EACCES, errno); + ASSERT_EQ(0, clock_gettime(CLOCK_MONOTONIC, &now)); + } while (timespeccmp(&now, &timeout, <)); + stopit = 1; + pthread_join(th0, NULL); +} + /* Successfully rename a file, overwriting the destination */ TEST_F(Rename, ok) { Index: tests/sys/fs/fusefs/read.cc =================================================================== --- tests/sys/fs/fusefs/read.cc +++ tests/sys/fs/fusefs/read.cc @@ -294,6 +294,43 @@ leak(fd); } +/* + * Regression test for a VFS locking bug: as of + * b3c6fe663bb90240f8bda6b5ba9c6a761f09f078 (8-February-2021), fusefs did not + * obtain an exclusive vnode lock before attempting to clear the attr cache + * after an unexpected eof. The vnode lock would already be exclusive except + * when FUSE_ASYNC_READ is set. + */ +TEST_F(AsyncRead, eof) +{ + const char FULLPATH[] = "mountpoint/some_file.txt"; + const char RELPATH[] = "some_file.txt"; + const char *CONTENTS = "abcdefghijklmnop"; + uint64_t ino = 42; + int fd; + uint64_t offset = 100; + ssize_t bufsize = strlen(CONTENTS); + ssize_t partbufsize = 3 * bufsize / 4; + ssize_t r; + uint8_t buf[bufsize]; + struct stat sb; + + expect_lookup(RELPATH, ino, offset + bufsize); + expect_open(ino, 0, 1); + expect_read(ino, 0, offset + bufsize, offset + partbufsize, CONTENTS); + expect_getattr(ino, offset + partbufsize); + + fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd) << strerror(errno); + + r = pread(fd, buf, bufsize, offset); + ASSERT_LE(0, r) << strerror(errno); + EXPECT_EQ(partbufsize, r) << strerror(errno); + ASSERT_EQ(0, fstat(fd, &sb)); + EXPECT_EQ((off_t)(offset + partbufsize), sb.st_size); + leak(fd); +} + /* 0-length reads shouldn't cause any confusion */ TEST_F(Read, direct_io_read_nothing) {