D20940.diff
Index: UPDATING
===================================================================
--- UPDATING
+++ UPDATING
@@ -31,6 +31,18 @@
disable the most expensive debugging functionality run
"ln -s 'abort:false,junk:false' /etc/malloc.conf".)
+20190627:
+ The vfs.fusefs.sync_unmount and vfs.fusefs.init_backgrounded sysctls
+ and the "-o sync_unmount" and "-o init_backgrounded" mount options have
+ been removed from mount_fusefs(8). You can safely remove them from
+ your scripts, because they had no effect.
+
+ The vfs.fusefs.fix_broken_io, vfs.fusefs.sync_resize,
+ vfs.fusefs.refresh_size, vfs.fusefs.mmap_enable,
+ vfs.fusefs.reclaim_revoked, and vfs.fusefs.data_cache_invalidate
+ sysctls have been removed. If you felt the need to set any of them to
+ a non-default value, please tell asomers@FreeBSD.org why.
+
20190620:
Entropy collection and the /dev/random device are no longer optional
components. The "device random" option has been removed.
Index: etc/mtree/BSD.tests.dist
===================================================================
--- etc/mtree/BSD.tests.dist
+++ etc/mtree/BSD.tests.dist
@@ -731,6 +731,8 @@
file
..
fs
+ fusefs
+ ..
tmpfs
..
..
Index: lib/libc/gen/getvfsbyname.c
===================================================================
--- lib/libc/gen/getvfsbyname.c
+++ lib/libc/gen/getvfsbyname.c
@@ -37,10 +37,26 @@
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <errno.h>
+#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
/*
+ * fusefs(5) file systems may have a "subtype" which gets appended to
+ * statfs(2)'s f_fstypename field on a per-mount basis. Allow getvfsbyname to
+ * match either the full "fusefs.foobar" or the more general "fusefs".
+ */
+static bool
+are_fusefs(const char *fsname, const char *vfc_name)
+{
+ const static char fusefs[] = "fusefs";
+ const static char fusefs_dot[] = "fusefs.";
+
+ return (strncmp(fsname, fusefs_dot, sizeof(fusefs_dot) - 1) == 0 &&
+ strcmp(fusefs, vfc_name) == 0);
+}
+
+/*
* Given a filesystem name, determine if it is resident in the kernel,
* and if it is resident, return its xvfsconf structure.
*/
@@ -62,7 +78,8 @@
}
cnt = buflen / sizeof(struct xvfsconf);
for (i = 0; i < cnt; i++) {
- if (strcmp(fsname, xvfsp[i].vfc_name) == 0) {
+ if (strcmp(fsname, xvfsp[i].vfc_name) == 0 ||
+ are_fusefs(fsname, xvfsp[i].vfc_name)) {
memcpy(vfcp, xvfsp + i, sizeof(struct xvfsconf));
free(xvfsp);
return (0);
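
In practice, the change above means getvfsbyname(3) now matches a subtyped fusefs mount as well as the generic "fusefs" entry. A minimal userland sketch, assuming a hypothetical mount whose subtype is "sshfs":

    #include <sys/param.h>
    #include <sys/mount.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct xvfsconf vfc;

        /* "fusefs.sshfs" is hypothetical; any "fusefs.<subtype>" string matches */
        if (getvfsbyname("fusefs.sshfs", &vfc) == 0)
            printf("resident filesystem: %s\n", vfc.vfc_name);
        else
            printf("fusefs is not resident in the kernel\n");
        return (0);
    }

The printed name is "fusefs", since the subtyped string matches the kernel's generic fusefs xvfsconf entry.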
Index: sbin/mount_fusefs/mount_fusefs.8
===================================================================
--- sbin/mount_fusefs/mount_fusefs.8
+++ sbin/mount_fusefs/mount_fusefs.8
@@ -3,6 +3,11 @@
.\" Copyright (c) 2005, 2006 Csaba Henk
.\" All rights reserved.
.\"
+.\" Copyright (c) 2019 The FreeBSD Foundation
+.\"
+.\" Portions of this documentation were written by BFF Storage Systems under
+.\" sponsorship from the FreeBSD Foundation.
+.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
@@ -29,7 +34,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd November 17, 2018
+.Dd July 18, 2019
.Dt MOUNT_FUSEFS 8
.Os
.Sh NAME
@@ -136,23 +141,38 @@
by prefixing them with
.Dq no ) :
.Bl -tag -width indent
-.It Cm default_permissions
-Enable traditional (file mode based) permission checking in kernel
.It Cm allow_other
Do not apply
.Sx STRICT ACCESS POLICY .
Only root can use this option
+.It Cm async
+I/O to the file system may be done asynchronously.
+Writes may be delayed and/or reordered.
+.It Cm default_permissions
+Enable traditional (file mode based) permission checking in kernel
+.It Cm intr
+Allow signals to interrupt operations that are blocked waiting for a reply from the server.
+When this option is in use, system calls may fail with
+.Er EINTR
+whenever a signal is received.
.It Cm max_read Ns = Ns Ar n
Limit size of read requests to
.Ar n
+.It Cm neglect_shares
+Do not refuse unmounting if there are secondary mounts
.It Cm private
Refuse shared mounting of the daemon.
This is the default behaviour, to allow sharing, expicitly use
.Fl o Cm noprivate
-.It Cm neglect_shares
-Do not refuse unmounting if there are secondary mounts
.It Cm push_symlinks_in
Prefix absolute symlinks with the mountpoint
+.It Cm subtype Ns = Ns Ar fsname
+Suffix
+.Ar fsname
+to the file system name as reported by
+.Xr statfs 2 .
+This option can be used to identify the file system implemented by
+.Ar fuse_daemon .
.El
.El
.Pp
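
With the new options, a hypothetical invocation and its visible effect might look like this (daemon and mountpoint names are made up):

    mount_fusefs /dev/fuse /mnt/ssh -o subtype=sshfs -o intr

after which statfs(2) on /mnt/ssh would report an f_fstypename of "fusefs.sshfs", and operations blocked on the daemon could be interrupted by signals.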
Index: sbin/mount_fusefs/mount_fusefs.c
===================================================================
--- sbin/mount_fusefs/mount_fusefs.c
+++ sbin/mount_fusefs/mount_fusefs.c
@@ -5,6 +5,11 @@
* Copyright (c) 2005 Csaba Henk
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -60,7 +65,6 @@
void usage(void);
void helpmsg(void);
void showversion(void);
-int init_backgrounded(void);
static struct mntopt mopts[] = {
#define ALTF_PRIVATE 0x01
@@ -73,8 +77,6 @@
{ "max_read=", 0, ALTF_MAXREAD, 1 },
#define ALTF_SUBTYPE 0x40
{ "subtype=", 0, ALTF_SUBTYPE, 1 },
- #define ALTF_SYNC_UNMOUNT 0x80
- { "sync_unmount", 0, ALTF_SYNC_UNMOUNT, 1 },
/*
* MOPT_AUTOMOUNTED, included by MOPT_STDOPTS, does not fit into
* the 'flags' argument to nmount(2). We have to abuse altflags
@@ -82,6 +84,8 @@
*/
#define ALTF_AUTOMOUNTED 0x100
{ "automounted", 0, ALTF_AUTOMOUNTED, 1 },
+ #define ALTF_INTR 0x200
+ { "intr", 0, ALTF_INTR, 1 },
/* Linux specific options, we silently ignore them */
{ "fsname=", 0, 0x00, 1 },
{ "fd=", 0, 0x00, 1 },
@@ -91,6 +95,8 @@
{ "large_read", 0, 0x00, 1 },
/* "nonempty", just the first two chars are stripped off during parsing */
{ "nempty", 0, 0x00, 1 },
+ { "async", 0, MNT_ASYNC, 0},
+ { "noasync", 1, MNT_ASYNC, 0},
MOPT_STDOPTS,
MOPT_END
};
@@ -107,7 +113,7 @@
{ 0, NULL, 0 }
};
-#define DEFAULT_MOUNT_FLAGS ALTF_PRIVATE | ALTF_SYNC_UNMOUNT
+#define DEFAULT_MOUNT_FLAGS ALTF_PRIVATE
int
main(int argc, char *argv[])
@@ -409,12 +415,6 @@
}
}
- if (fd >= 0 && ! init_backgrounded() && close(fd) < 0) {
- if (pid)
- kill(pid, SIGKILL);
- err(1, "failed to close fuse device");
- }
-
/* Prepare the options vector for nmount(). build_iovec() is declared
* in mntopts.h. */
sprintf(fdstr, "%d", fd);
@@ -471,6 +471,7 @@
" -o allow_other allow access to other users\n"
/* " -o nonempty allow mounts over non-empty file/dir\n" */
" -o default_permissions enable permission checking by kernel\n"
+ " -o intr interruptible mount\n"
/*
" -o fsname=NAME set filesystem name\n"
" -o large_read issue large read requests (2.4 only)\n"
@@ -481,7 +482,6 @@
" -o neglect_shares don't report EBUSY when unmount attempted\n"
" in presence of secondary mounts\n"
" -o push_symlinks_in prefix absolute symlinks with mountpoint\n"
- " -o sync_unmount do unmount synchronously\n"
);
exit(EX_USAGE);
}
@@ -491,18 +491,4 @@
{
puts("mount_fusefs [fuse4bsd] version: " FUSE4BSD_VERSION);
exit(EX_USAGE);
-}
-
-int
-init_backgrounded(void)
-{
- int ibg;
- size_t len;
-
- len = sizeof(ibg);
-
- if (sysctlbyname("vfs.fusefs.init_backgrounded", &ibg, &len, NULL, 0))
- return (0);
-
- return (ibg);
}
Index: share/man/man5/fusefs.5
===================================================================
--- share/man/man5/fusefs.5
+++ share/man/man5/fusefs.5
@@ -3,8 +3,8 @@
.\"
.\" Copyright (c) 2019 The FreeBSD Foundation
.\"
-.\" This software was developed by BFF Storage Systems, LLC under sponsorship
-.\" from the FreeBSD Foundation.
+.\" This documentation was written by BFF Storage Systems, LLC under
+.\" sponsorship from the FreeBSD Foundation.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
@@ -28,7 +28,7 @@
.\" SUCH DAMAGE.
.\"
.\" $FreeBSD$
-.Dd April 13, 2019
+.Dd June 27, 2019
.Dt FUSEFS 5
.Os
.Sh NAME
@@ -60,11 +60,9 @@
API is portable.
Many daemons can run on multiple operating systems with minimal modifications.
.Sh SYSCTL VARIABLES
-The following variables are available as both
+The following
.Xr sysctl 8
-variables and
-.Xr loader 8
-tunables:
+variables are available:
.Bl -tag -width indent
.It Va vfs.fusefs.kernelabi_major
Major version of the FUSE kernel ABI supported by this driver.
@@ -73,7 +71,7 @@
.It Va vfs.fusefs.data_cache_mode
Controls how
.Nm
-will cache file data.
+will cache file data for pre-7.23 file systems.
A value of 0 will disable caching entirely.
Every data access will be forwarded to the daemon.
A value of 1 will select write-through caching.
@@ -84,33 +82,25 @@
to the daemon by the page daemon.
Write-back caching is usually unsafe, especially for FUSE file systems that
require network access.
-.It Va vfs.fusefs.lookup_cache_enable
-Controls whether
-.Nm
-will cache lookup responses from the file system.
-FUSE file systems indicate whether lookup responses should be cacheable, but
-it may be useful to globally disable caching them if a file system is
-misbehaving.
+.Pp
+FUSE file systems using protocol 7.23 or later specify their cache behavior
+on a per-mountpoint basis, ignoring this sysctl.
+.It Va vfs.fusefs.stats.filehandle_count
+Current number of open FUSE file handles.
+.It Va vfs.fusefs.stats.lookup_cache_hits
+Total number of lookup cache hits.
+.It Va vfs.fusefs.stats.lookup_cache_misses
+Total number of lookup cache misses.
+.It Va vfs.fusefs.stats.node_count
+Current number of allocated FUSE vnodes.
+.It Va vfs.fusefs.stats.ticket_count
+Current number of allocated FUSE tickets, which is roughly equal to the
+number of FUSE operations currently being processed by daemons.
.\" Undocumented sysctls
.\" ====================
-.\" Counters: I intend to rename to vfs.fusefs.stats.* for clarity
-.\" vfs.fusefs.lookup_cache_{hits, misses}
-.\" vfs.fusefs.filehandle_count
-.\" vfs.fusefs.ticker_count
-.\" vfs.fusefs.node_count
-.\"
-.\" vfs.fusefs.version - useless since the driver moved in-tree
-.\" vfs.fusefs.reclaim_revoked: I don't understand it well-enough
-.\" vfs.fusefs.sync_unmount: dead code
.\" vfs.fusefs.enforce_dev_perms: I don't understand it well enough.
-.\" vfs.fusefs.init_backgrounded: dead code
.\" vfs.fusefs.iov_credit: I don't understand it well enough
.\" vfs.fusefs.iov_permanent_bufsize: I don't understand it well enough
-.\" vfs.fusefs.fix_broken_io: I don't understand it well enough
-.\" vfs.fusefs.sync_resize: useless and should be removed
-.\" vfs.fusefs.refresh_size: probably useless?
-.\" vfs.fusefs.mmap_enable: why is this optional?
-.\" vfs.fusefs.data_cache_invalidate: what is this needed for?
.Sh SEE ALSO
.Xr mount_fusefs 8
.Sh HISTORY
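
The renamed statistics nodes documented above can be read like any other sysctl. A small userland sketch that reads the filehandle counter (the node name comes from the man page text above):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t fh_count;
        size_t len = sizeof(fh_count);

        /* Current number of open FUSE file handles */
        if (sysctlbyname("vfs.fusefs.stats.filehandle_count", &fh_count,
            &len, NULL, 0) == 0)
            printf("open FUSE file handles: %ju\n", (uintmax_t)fh_count);
        return (0);
    }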
Index: share/man/man9/VOP_FSYNC.9
===================================================================
--- share/man/man9/VOP_FSYNC.9
+++ share/man/man9/VOP_FSYNC.9
@@ -4,6 +4,11 @@
.\"
.\" All rights reserved.
.\"
+.\" Copyright (c) 2019 The FreeBSD Foundation
+.\"
+.\" Portions of this documentation were written by BFF Storage Systems under
+.\" sponsorship from the FreeBSD Foundation.
+.\"
.\" This program is free software.
.\"
.\" Redistribution and use in source and binary forms, with or without
Index: share/mk/bsd.compiler.mk
===================================================================
--- share/mk/bsd.compiler.mk
+++ share/mk/bsd.compiler.mk
@@ -19,6 +19,7 @@
# COMPILER_FEATURES will contain one or more of the following, based on
# compiler support for that feature:
#
+# - c++14: supports full (or nearly full) C++14 programming environment.
# - c++11: supports full (or nearly full) C++11 programming environment.
# - retpoline: supports the retpoline speculative execution vulnerability
# mitigation.
@@ -200,6 +201,10 @@
.endif
${X_}COMPILER_FEATURES=
+.if ${${X_}COMPILER_TYPE} == "clang" || \
+ (${${X_}COMPILER_TYPE} == "gcc" && ${${X_}COMPILER_VERSION} >= 50000)
+${X_}COMPILER_FEATURES+= c++14
+.endif
.if ${${X_}COMPILER_TYPE} == "clang" || \
(${${X_}COMPILER_TYPE} == "gcc" && ${${X_}COMPILER_VERSION} >= 40800)
${X_}COMPILER_FEATURES+= c++11
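
Consumers can test the new feature string the same way the tree already tests for c++11. A sketch of a Makefile fragment, assuming the usual bsd.compiler.mk inclusion path:

    .include <bsd.compiler.mk>

    .if ${COMPILER_FEATURES:Mc++14}
    CXXFLAGS+=	-std=c++14
    .endif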
Index: sys/fs/fuse/fuse.h
===================================================================
--- sys/fs/fuse/fuse.h
+++ sys/fs/fuse/fuse.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -63,87 +68,10 @@
#define FUSE_MIN_DAEMON_TIMEOUT 0 /* s */
#define FUSE_MAX_DAEMON_TIMEOUT 600 /* s */
-#ifndef FUSE_FREEBSD_VERSION
-#define FUSE_FREEBSD_VERSION "0.4.4"
-#endif
-
-/* Mapping versions to features */
-
-#define FUSE_KERNELABI_GEQ(maj, min) \
-(FUSE_KERNEL_VERSION > (maj) || (FUSE_KERNEL_VERSION == (maj) && FUSE_KERNEL_MINOR_VERSION >= (min)))
-
-/*
- * Appearance of new FUSE operations is not always in par with version
- * numbering... At least, 7.3 is a sufficient condition for having
- * FUSE_{ACCESS,CREATE}.
- */
-#if FUSE_KERNELABI_GEQ(7, 3)
-#ifndef FUSE_HAS_ACCESS
-#define FUSE_HAS_ACCESS 1
-#endif
-#ifndef FUSE_HAS_CREATE
-#define FUSE_HAS_CREATE 1
-#endif
-#else /* FUSE_KERNELABI_GEQ(7, 3) */
-#ifndef FUSE_HAS_ACCESS
-#define FUSE_HAS_ACCESS 0
-#endif
-#ifndef FUSE_HAS_CREATE
-#define FUSE_HAS_CREATE 0
-#endif
-#endif
-
-#if FUSE_KERNELABI_GEQ(7, 7)
-#ifndef FUSE_HAS_GETLK
-#define FUSE_HAS_GETLK 1
-#endif
-#ifndef FUSE_HAS_SETLK
-#define FUSE_HAS_SETLK 1
-#endif
-#ifndef FUSE_HAS_SETLKW
-#define FUSE_HAS_SETLKW 1
-#endif
-#ifndef FUSE_HAS_INTERRUPT
-#define FUSE_HAS_INTERRUPT 1
-#endif
-#else /* FUSE_KERNELABI_GEQ(7, 7) */
-#ifndef FUSE_HAS_GETLK
-#define FUSE_HAS_GETLK 0
-#endif
-#ifndef FUSE_HAS_SETLK
-#define FUSE_HAS_SETLK 0
-#endif
-#ifndef FUSE_HAS_SETLKW
-#define FUSE_HAS_SETLKW 0
-#endif
-#ifndef FUSE_HAS_INTERRUPT
-#define FUSE_HAS_INTERRUPT 0
-#endif
-#endif
-
-#if FUSE_KERNELABI_GEQ(7, 8)
-#ifndef FUSE_HAS_FLUSH_RELEASE
-#define FUSE_HAS_FLUSH_RELEASE 1
-/*
- * "DESTROY" came in the middle of the 7.8 era,
- * so this is not completely exact...
- */
-#ifndef FUSE_HAS_DESTROY
-#define FUSE_HAS_DESTROY 1
-#endif
-#endif
-#else /* FUSE_KERNELABI_GEQ(7, 8) */
-#ifndef FUSE_HAS_FLUSH_RELEASE
-#define FUSE_HAS_FLUSH_RELEASE 0
-#ifndef FUSE_HAS_DESTROY
-#define FUSE_HAS_DESTROY 0
-#endif
-#endif
-#endif
-
/* misc */
SYSCTL_DECL(_vfs_fusefs);
+SYSCTL_DECL(_vfs_fusefs_stats);
/* Fuse locking */
Index: sys/fs/fuse/fuse_device.c
===================================================================
--- sys/fs/fuse/fuse_device.c
+++ sys/fs/fuse/fuse_device.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -81,27 +86,28 @@
#include <sys/selinfo.h>
#include "fuse.h"
+#include "fuse_internal.h"
#include "fuse_ipc.h"
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , device, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*");
static struct cdev *fuse_dev;
+static d_kqfilter_t fuse_device_filter;
static d_open_t fuse_device_open;
-static d_close_t fuse_device_close;
static d_poll_t fuse_device_poll;
static d_read_t fuse_device_read;
static d_write_t fuse_device_write;
static struct cdevsw fuse_device_cdevsw = {
+ .d_kqfilter = fuse_device_filter,
.d_open = fuse_device_open,
- .d_close = fuse_device_close,
.d_name = "fuse",
.d_poll = fuse_device_poll,
.d_read = fuse_device_read,
@@ -109,6 +115,15 @@
.d_version = D_VERSION,
};
+static int fuse_device_filt_read(struct knote *kn, long hint);
+static void fuse_device_filt_detach(struct knote *kn);
+
+struct filterops fuse_device_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = fuse_device_filt_detach,
+ .f_event = fuse_device_filt_read,
+};
+
/****************************
*
* >>> Fuse device op defs
@@ -119,11 +134,100 @@
fdata_dtor(void *arg)
{
struct fuse_data *fdata;
+ struct fuse_ticket *tick;
fdata = arg;
+ if (fdata == NULL)
+ return;
+
+ fdata_set_dead(fdata);
+
+ FUSE_LOCK();
+ fuse_lck_mtx_lock(fdata->aw_mtx);
+	/* wake up poll()ers */
+ selwakeuppri(&fdata->ks_rsel, PZERO + 1);
+ /* Don't let syscall handlers wait in vain */
+ while ((tick = fuse_aw_pop(fdata))) {
+ fuse_lck_mtx_lock(tick->tk_aw_mtx);
+ fticket_set_answered(tick);
+ tick->tk_aw_errno = ENOTCONN;
+ wakeup(tick);
+ fuse_lck_mtx_unlock(tick->tk_aw_mtx);
+ FUSE_ASSERT_AW_DONE(tick);
+ fuse_ticket_drop(tick);
+ }
+ fuse_lck_mtx_unlock(fdata->aw_mtx);
+
+ /* Cleanup unsent operations */
+ fuse_lck_mtx_lock(fdata->ms_mtx);
+ while ((tick = fuse_ms_pop(fdata))) {
+ fuse_ticket_drop(tick);
+ }
+ fuse_lck_mtx_unlock(fdata->ms_mtx);
+ FUSE_UNLOCK();
+
fdata_trydestroy(fdata);
}
+static int
+fuse_device_filter(struct cdev *dev, struct knote *kn)
+{
+ struct fuse_data *data;
+ int error;
+
+ error = devfs_get_cdevpriv((void **)&data);
+
+ /* EVFILT_WRITE is not supported; the device is always ready to write */
+ if (error == 0 && kn->kn_filter == EVFILT_READ) {
+ kn->kn_fop = &fuse_device_rfiltops;
+ kn->kn_hook = data;
+ knlist_add(&data->ks_rsel.si_note, kn, 0);
+ error = 0;
+ } else if (error == 0) {
+ error = EINVAL;
+ kn->kn_data = error;
+ }
+
+ return (error);
+}
+
+static void
+fuse_device_filt_detach(struct knote *kn)
+{
+ struct fuse_data *data;
+
+ data = (struct fuse_data*)kn->kn_hook;
+ MPASS(data != NULL);
+ knlist_remove(&data->ks_rsel.si_note, kn, 0);
+ kn->kn_hook = NULL;
+}
+
+static int
+fuse_device_filt_read(struct knote *kn, long hint)
+{
+ struct fuse_data *data;
+ int ready;
+
+ data = (struct fuse_data*)kn->kn_hook;
+ MPASS(data != NULL);
+
+ mtx_assert(&data->ms_mtx, MA_OWNED);
+ if (fdata_get_dead(data)) {
+ kn->kn_flags |= EV_EOF;
+ kn->kn_fflags = ENODEV;
+ kn->kn_data = 1;
+ ready = 1;
+ } else if (STAILQ_FIRST(&data->ms_head)) {
+ MPASS(data->ms_count >= 1);
+ kn->kn_data = data->ms_count;
+ ready = 1;
+ } else {
+ ready = 0;
+ }
+
+ return (ready);
+}
+
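With the kqueue filter in place, a FUSE daemon could block on the device with kevent(2) instead of poll(2) or select(2). A minimal userland sketch, assuming fusefd is the descriptor obtained by opening /dev/fuse:

    #include <sys/types.h>
    #include <sys/event.h>
    #include <sys/time.h>
    #include <err.h>

    static void
    wait_for_request(int fusefd)
    {
        struct kevent change, event;
        int kq;

        if ((kq = kqueue()) == -1)
            err(1, "kqueue");
        EV_SET(&change, fusefd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        /* Blocks until at least one FUSE request is queued for the daemon */
        if (kevent(kq, &change, 1, &event, 1, NULL) == -1)
            err(1, "kevent");
        /* event.data reports the number of pending messages (ms_count) */
    }

As the filter code above shows, EVFILT_WRITE is not supported because the device is always writable.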
/*
* Resources are set up on a per-open basis
*/
@@ -133,52 +237,17 @@
struct fuse_data *fdata;
int error;
- SDT_PROBE2(fuse, , device, trace, 1, "device open");
+ SDT_PROBE2(fusefs, , device, trace, 1, "device open");
fdata = fdata_alloc(dev, td->td_ucred);
error = devfs_set_cdevpriv(fdata, fdata_dtor);
if (error != 0)
fdata_trydestroy(fdata);
else
- SDT_PROBE2(fuse, , device, trace, 1, "device open success");
+ SDT_PROBE2(fusefs, , device, trace, 1, "device open success");
return (error);
}
-static int
-fuse_device_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
-{
- struct fuse_data *data;
- struct fuse_ticket *tick;
- int error;
-
- error = devfs_get_cdevpriv((void **)&data);
- if (error != 0)
- return (error);
- if (!data)
- panic("no fuse data upon fuse device close");
- fdata_set_dead(data);
-
- FUSE_LOCK();
- fuse_lck_mtx_lock(data->aw_mtx);
- /* wakup poll()ers */
- selwakeuppri(&data->ks_rsel, PZERO + 1);
- /* Don't let syscall handlers wait in vain */
- while ((tick = fuse_aw_pop(data))) {
- fuse_lck_mtx_lock(tick->tk_aw_mtx);
- fticket_set_answered(tick);
- tick->tk_aw_errno = ENOTCONN;
- wakeup(tick);
- fuse_lck_mtx_unlock(tick->tk_aw_mtx);
- FUSE_ASSERT_AW_DONE(tick);
- fuse_ticket_drop(tick);
- }
- fuse_lck_mtx_unlock(data->aw_mtx);
- FUSE_UNLOCK();
-
- SDT_PROBE2(fuse, , device, trace, 1, "device close");
- return (0);
-}
-
int
fuse_device_poll(struct cdev *dev, int events, struct thread *td)
{
@@ -219,7 +288,7 @@
int buflen[3];
int i;
- SDT_PROBE2(fuse, , device, trace, 1, "fuse device read");
+ SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read");
err = devfs_get_cdevpriv((void **)&data);
if (err != 0)
@@ -228,7 +297,7 @@
fuse_lck_mtx_lock(data->ms_mtx);
again:
if (fdata_get_dead(data)) {
- SDT_PROBE2(fuse, , device, trace, 2,
+ SDT_PROBE2(fusefs, , device, trace, 2,
"we know early on that reader should be kicked so we "
"don't wait for news");
fuse_lck_mtx_unlock(data->ms_mtx);
@@ -256,7 +325,7 @@
* -- and some other cases, too, tho not totally clear, when
* (cv_signal/wakeup_one signals the whole process ?)
*/
- SDT_PROBE2(fuse, , device, trace, 1, "no message on thread");
+ SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread");
goto again;
}
fuse_lck_mtx_unlock(data->ms_mtx);
@@ -266,9 +335,10 @@
* somebody somewhere -- eg., umount routine --
* wants this liaison finished off
*/
- SDT_PROBE2(fuse, , device, trace, 2, "reader is to be sacked");
+ SDT_PROBE2(fusefs, , device, trace, 2,
+ "reader is to be sacked");
if (tick) {
- SDT_PROBE2(fuse, , device, trace, 2, "weird -- "
+ SDT_PROBE2(fusefs, , device, trace, 2, "weird -- "
"\"kick\" is set tho there is message");
FUSE_ASSERT_MS_DONE(tick);
fuse_ticket_drop(tick);
@@ -276,7 +346,7 @@
return (ENODEV); /* This should make the daemon get off
* of us */
}
- SDT_PROBE2(fuse, , device, trace, 1,
+ SDT_PROBE2(fusefs, , device, trace, 1,
"fuse device read message successfully");
KASSERT(tick->tk_ms_bufdata || tick->tk_ms_bufsize == 0,
@@ -311,7 +381,7 @@
*/
if (uio->uio_resid < buflen[i]) {
fdata_set_dead(data);
- SDT_PROBE2(fuse, , device, trace, 2,
+ SDT_PROBE2(fusefs, , device, trace, 2,
"daemon is stupid, kick it off...");
err = ENODEV;
break;
@@ -331,23 +401,26 @@
fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio)
{
if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) {
- SDT_PROBE2(fuse, , device, trace, 1, "Format error: body size "
+ SDT_PROBE2(fusefs, , device, trace, 1,
+ "Format error: body size "
"differs from size claimed by header");
return (EINVAL);
}
- if (uio->uio_resid && ohead->error) {
- SDT_PROBE2(fuse, , device, trace, 1,
+ if (uio->uio_resid && ohead->unique != 0 && ohead->error) {
+ SDT_PROBE2(fusefs, , device, trace, 1,
"Format error: non zero error but message had a body");
return (EINVAL);
}
- /* Sanitize the linuxism of negative errnos */
- ohead->error = -(ohead->error);
return (0);
}
-SDT_PROBE_DEFINE1(fuse, , device, fuse_device_write_bumped_into_callback,
- "uint64_t");
+SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify,
+ "struct fuse_out_header*");
+SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket,
+ "uint64_t");
+SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found,
+ "struct fuse_ticket*");
/*
* fuse_device_write first reads the header sent by the daemon.
* If that's OK, looks up ticket/callback node by the unique id seen in header.
@@ -360,15 +433,17 @@
struct fuse_out_header ohead;
int err = 0;
struct fuse_data *data;
- struct fuse_ticket *tick, *x_tick;
+ struct mount *mp;
+ struct fuse_ticket *tick, *itick, *x_tick;
int found = 0;
err = devfs_get_cdevpriv((void **)&data);
if (err != 0)
return (err);
+ mp = data->mp;
if (uio->uio_resid < sizeof(struct fuse_out_header)) {
- SDT_PROBE2(fuse, , device, trace, 1,
+ SDT_PROBE2(fusefs, , device, trace, 1,
"fuse_device_write got less than a header!");
fdata_set_dead(data);
return (EINVAL);
@@ -393,15 +468,29 @@
fuse_lck_mtx_lock(data->aw_mtx);
TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link,
x_tick) {
- SDT_PROBE1(fuse, , device,
- fuse_device_write_bumped_into_callback,
- tick->tk_unique);
if (tick->tk_unique == ohead.unique) {
+ SDT_PROBE1(fusefs, , device, fuse_device_write_found,
+ tick);
found = 1;
fuse_aw_remove(tick);
break;
}
}
+ if (found && tick->irq_unique > 0) {
+ /*
+ * Discard the FUSE_INTERRUPT ticket that tried to interrupt
+ * this operation
+ */
+ TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link,
+ x_tick) {
+ if (itick->tk_unique == tick->irq_unique) {
+ fuse_aw_remove(itick);
+ fuse_ticket_drop(itick);
+ break;
+ }
+ }
+ tick->irq_unique = 0;
+ }
fuse_lck_mtx_unlock(data->aw_mtx);
if (found) {
@@ -414,13 +503,15 @@
* via ticket_drop(), so no manual mucking
* around...)
*/
- SDT_PROBE2(fuse, , device, trace, 1,
+ SDT_PROBE2(fusefs, , device, trace, 1,
"pass ticket to a callback");
+ /* Sanitize the linuxism of negative errnos */
+ ohead.error *= -1;
memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead));
err = tick->tk_aw_handler(tick, uio);
} else {
/* pretender doesn't wanna do anything with answer */
- SDT_PROBE2(fuse, , device, trace, 1,
+ SDT_PROBE2(fusefs, , device, trace, 1,
"stuff devalidated, so we drop it");
}
@@ -430,11 +521,51 @@
* because fuse_ticket_drop() will deal with refcount anyway.
*/
fuse_ticket_drop(tick);
+	} else if (ohead.unique == 0) {
+ /* unique == 0 means asynchronous notification */
+ SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead);
+ switch (ohead.error) {
+ case FUSE_NOTIFY_INVAL_ENTRY:
+ err = fuse_internal_invalidate_entry(mp, uio);
+ break;
+ case FUSE_NOTIFY_INVAL_INODE:
+ err = fuse_internal_invalidate_inode(mp, uio);
+ break;
+ case FUSE_NOTIFY_RETRIEVE:
+ case FUSE_NOTIFY_STORE:
+ /*
+ * Unimplemented. I don't know of any file systems
+ * that use them, and the protocol isn't sound anyway,
+ * since the notification messages don't include the
+ * inode's generation number. Without that, it's
+ * possible to manipulate the cache of the wrong vnode.
+ * Finally, it's not defined what this message should
+ * do for a file with dirty cache.
+ */
+ case FUSE_NOTIFY_POLL:
+ /* Unimplemented. See comments in fuse_vnops */
+ default:
+ /* Not implemented */
+ err = ENOSYS;
+ }
} else {
/* no callback at all! */
- SDT_PROBE2(fuse, , device, trace, 1,
- "erhm, no handler for this response");
- err = EINVAL;
+ SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket,
+ ohead.unique);
+ if (ohead.error == -EAGAIN) {
+ /*
+ * This was probably a response to a FUSE_INTERRUPT
+ * operation whose original operation is already
+ * complete. We can't store FUSE_INTERRUPT tickets
+ * indefinitely because their responses are optional.
+ * So we delete them when the original operation
+			 * completes. And sadly the fuse_out_header doesn't
+ * identify the opcode, so we have to guess.
+ */
+ err = 0;
+ } else {
+ err = EINVAL;
+ }
}
return (err);
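
For reference, the unique == 0 path above corresponds to the daemon writing a notification rather than a reply to /dev/fuse. A hedged sketch of the daemon side for FUSE_NOTIFY_INVAL_ENTRY; the struct layouts and the notify code come from the FUSE protocol header shipped with the daemon, and fusefd and parent_nodeid are assumed values:

    #include <sys/uio.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    /* struct fuse_out_header, struct fuse_notify_inval_entry_out, and
     * FUSE_NOTIFY_INVAL_ENTRY come from the daemon's FUSE protocol header */

    static void
    notify_inval_entry(int fusefd, uint64_t parent_nodeid, const char *name)
    {
        struct fuse_notify_inval_entry_out inval;
        struct fuse_out_header ohead;
        struct iovec iov[3];

        inval.parent = parent_nodeid;
        inval.namelen = strlen(name);
        inval.padding = 0;
        ohead.unique = 0;        /* unique == 0 marks an async notification */
        ohead.error = FUSE_NOTIFY_INVAL_ENTRY;
        ohead.len = sizeof(ohead) + sizeof(inval) + inval.namelen + 1;
        iov[0].iov_base = &ohead;
        iov[0].iov_len = sizeof(ohead);
        iov[1].iov_base = &inval;
        iov[1].iov_len = sizeof(inval);
        iov[2].iov_base = (void *)(uintptr_t)name;
        iov[2].iov_len = inval.namelen + 1;
        (void)writev(fusefd, iov, 3);
    }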
@@ -445,7 +576,7 @@
{
fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR,
- S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, "fuse");
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse");
if (fuse_dev == NULL)
return (ENOMEM);
return (0);
Index: sys/fs/fuse/fuse_file.h
===================================================================
--- sys/fs/fuse/fuse_file.h
+++ sys/fs/fuse/fuse_file.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -66,52 +71,115 @@
#include <sys/mman.h>
#include <sys/vnode.h>
+/*
+ * The fufh type is the access mode of the fuse file handle. It's the portion
+ * of the open(2) flags related to permission.
+ */
typedef enum fufh_type {
FUFH_INVALID = -1,
- FUFH_RDONLY = 0,
- FUFH_WRONLY = 1,
- FUFH_RDWR = 2,
- FUFH_MAXTYPE = 3,
+ FUFH_RDONLY = O_RDONLY,
+ FUFH_WRONLY = O_WRONLY,
+ FUFH_RDWR = O_RDWR,
+ FUFH_EXEC = O_EXEC,
} fufh_type_t;
-_Static_assert(FUFH_RDONLY == O_RDONLY, "RDONLY");
-_Static_assert(FUFH_WRONLY == O_WRONLY, "WRONLY");
-_Static_assert(FUFH_RDWR == O_RDWR, "RDWR");
+/*
+ * FUSE File Handles
+ *
+ * The FUSE protocol says that a server may assign a unique 64-bit file handle
+ * every time that a file is opened. Effectively, that's once for each file
+ * descriptor.
+ *
+ * Unfortunately, the VFS doesn't help us here. VOPs don't have a
+ * struct file* argument. fileops do, but many syscalls bypass the fileops
+ * layer and go straight to a vnode. Some, like writing from cache, can't
+ * track a file handle even in theory. The entire concept of the file handle
+ * is a product of FUSE's Linux origins; Linux lacks vnodes and almost every
+ * file system operation takes a struct file* argument.
+ *
+ * Since FreeBSD's VFS is more file descriptor-agnostic, we must store FUSE
+ * filehandles in the vnode. One option would be to only store a single file
+ * handle and never open FUSE files concurrently. That's what NetBSD does.
+ * But that violates FUSE's security model. FUSE expects the server to do all
+ * authorization (except when mounted with -o default_permissions). In order
+ * to do that, the server needs us to send FUSE_OPEN every time somebody opens
+ * a new file descriptor.
+ *
+ * Another option would be to never open FUSE files concurrently, but send a
+ * FUSE_ACCESS prior to every open after the first. That would give the server
+ * the opportunity to authorize the access. Unfortunately, the FUSE protocol
+ * makes ACCESS optional. File systems that don't implement it are assumed to
+ * authorize everything. A survey of 32 fuse file systems showed that only 14
+ * implemented access. Among the laggards were a few that really ought to be
+ * doing server-side authorization.
+ *
+ * So we do something hacky, similar to what OpenBSD, Illumos, and OSXFuse do.
+ * We store a list of file handles, one for each combination of vnode, uid,
+ * gid, pid, and access mode. When opening a file, we first check whether
+ * there's already a matching file handle. If so, we reuse it. If not, we
+ * send FUSE_OPEN and create a new file handle. That minimizes the number of
+ * open file handles while still allowing the server to authorize stuff.
+ *
+ * VOPs that need a file handle search through the list for a close match.
+ * They can't be guaranteed of finding an exact match because, for example, a
+ * process may have changed its UID since opening the file. Also, most VOPs
+ * don't know exactly what permission they need. Is O_RDWR required or is
+ * O_RDONLY good enough? So the file handle we end up using may not be exactly
+ * the one we're supposed to use with that file descriptor. But if the FUSE
+ * file system isn't too picky, it will work. (FWIW even Linux sometimes
+ * guesses the file handle, during writes from cache or most SETATTR
+ * operations).
+ *
+ * I suspect this mess is part of the reason why neither NFS nor 9P have an
+ * equivalent of FUSE file handles.
+ */
struct fuse_filehandle {
+ LIST_ENTRY(fuse_filehandle) next;
+
+ /* The filehandle returned by FUSE_OPEN */
uint64_t fh_id;
- fufh_type_t fh_type;
-};
-#define FUFH_IS_VALID(f) ((f)->fh_type != FUFH_INVALID)
+ /*
+ * flags returned by FUSE_OPEN
+ * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE
+ * Unsupported:
+ * FOPEN_NONSEEKABLE: Adding support would require a new per-file
+ * or per-vnode attribute, which would have to be checked by
+ * kern_lseek (and others) for every file system. The benefit is
+ * dubious, since I'm unaware of any file systems in ports that use
+ * this flag.
+ */
+ uint32_t fuse_open_flags;
-static inline fufh_type_t
-fuse_filehandle_xlate_from_mmap(int fflags)
-{
- if (fflags & (PROT_READ | PROT_WRITE))
- return FUFH_RDWR;
- else if (fflags & (PROT_WRITE))
- return FUFH_WRONLY;
- else if ((fflags & PROT_READ) || (fflags & PROT_EXEC))
- return FUFH_RDONLY;
- else
- return FUFH_INVALID;
-}
+ /* The access mode of the file handle */
+ fufh_type_t fufh_type;
-static inline fufh_type_t
-fuse_filehandle_xlate_from_fflags(int fflags)
-{
- if ((fflags & FREAD) && (fflags & FWRITE))
- return FUFH_RDWR;
- else if (fflags & (FWRITE))
- return FUFH_WRONLY;
- else if (fflags & (FREAD))
- return FUFH_RDONLY;
- else
- panic("FUSE: What kind of a flag is this (%x)?", fflags);
-}
+ /* Credentials used to open the file */
+ gid_t gid;
+ pid_t pid;
+ uid_t uid;
+};
+#define FUFH_IS_VALID(f) ((f)->fufh_type != FUFH_INVALID)
+
+/*
+ * Get the flags to use for FUSE_CREATE, FUSE_OPEN and FUSE_RELEASE
+ *
+ * These are supposed to be the same as the flags argument to open(2).
+ * However, since we can't reliably associate a fuse_filehandle with a specific
+ * file descriptor, it would be dangerous to include anything more than
+ * the access mode flags. For example, suppose we open a file twice, once with
+ * O_APPEND and once without. Then the user pwrite(2)s to offset 0 using the
+ * second file descriptor. If fusefs uses the first file handle, then the
+ * server may append the write to the end of the file rather than at offset 0.
+ * To prevent problems like this, we only ever send the portion of flags
+ * related to access mode.
+ *
+ * It's essential to send that portion, because FUSE uses it for server-side
+ * authorization.
+ */
static inline int
-fuse_filehandle_xlate_to_oflags(fufh_type_t type)
+fufh_type_2_fflags(fufh_type_t type)
{
int oflags = -1;
@@ -119,6 +187,7 @@
case FUFH_RDONLY:
case FUFH_WRONLY:
case FUFH_RDWR:
+ case FUFH_EXEC:
oflags = type;
break;
default:
@@ -128,19 +197,28 @@
return oflags;
}
-int fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type);
-fufh_type_t fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type);
-int fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp);
-int fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp);
+bool fuse_filehandle_validrw(struct vnode *vp, int mode,
+ struct ucred *cred, pid_t pid);
+int fuse_filehandle_get(struct vnode *vp, int fflag,
+ struct fuse_filehandle **fufhp, struct ucred *cred,
+ pid_t pid);
+int fuse_filehandle_get_anyflags(struct vnode *vp,
+ struct fuse_filehandle **fufhp, struct ucred *cred,
+ pid_t pid);
+int fuse_filehandle_getrw(struct vnode *vp, int fflag,
+ struct fuse_filehandle **fufhp, struct ucred *cred,
+ pid_t pid);
void fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp, uint64_t fh_id);
-int fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type,
+ struct fuse_filehandle **fufhp, struct thread *td,
+ struct ucred *cred, struct fuse_open_out *foo);
+int fuse_filehandle_open(struct vnode *vp, int mode,
struct fuse_filehandle **fufhp, struct thread *td,
struct ucred *cred);
-int fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type,
+int fuse_filehandle_close(struct vnode *vp, struct fuse_filehandle *fufh,
struct thread *td, struct ucred *cred);
+
+void fuse_file_init(void);
+void fuse_file_destroy(void);
#endif /* _FUSE_FILE_H_ */
Index: sys/fs/fuse/fuse_file.c
===================================================================
--- sys/fs/fuse/fuse_file.c
+++ sys/fs/fuse/fuse_file.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -59,8 +64,9 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/module.h>
#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/module.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/conf.h>
@@ -79,52 +85,61 @@
#include "fuse.h"
#include "fuse_file.h"
#include "fuse_internal.h"
+#include "fuse_io.h"
#include "fuse_ipc.h"
#include "fuse_node.h"
-SDT_PROVIDER_DECLARE(fuse);
+MALLOC_DEFINE(M_FUSE_FILEHANDLE, "fuse_filefilehandle", "FUSE file handle");
+
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , file, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , file, trace, "int", "char*");
-static int fuse_fh_count = 0;
+static counter_u64_t fuse_fh_count;
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, filehandle_count, CTLFLAG_RD,
- &fuse_fh_count, 0, "number of open FUSE filehandles");
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, filehandle_count, CTLFLAG_RD,
+ &fuse_fh_count, "number of open FUSE filehandles");
+/* Get the FUFH type for a particular access mode */
+static inline fufh_type_t
+fflags_2_fufh_type(int fflags)
+{
+ if ((fflags & FREAD) && (fflags & FWRITE))
+ return FUFH_RDWR;
+ else if (fflags & (FWRITE))
+ return FUFH_WRONLY;
+ else if (fflags & (FREAD))
+ return FUFH_RDONLY;
+ else if (fflags & (FEXEC))
+ return FUFH_EXEC;
+ else
+ panic("FUSE: What kind of a flag is this (%x)?", fflags);
+}
+
int
-fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type,
+fuse_filehandle_open(struct vnode *vp, int a_mode,
struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred)
{
struct fuse_dispatcher fdi;
struct fuse_open_in *foi;
struct fuse_open_out *foo;
+ fufh_type_t fufh_type;
int err = 0;
int oflags = 0;
int op = FUSE_OPEN;
- if (fuse_filehandle_valid(vp, fufh_type)) {
- panic("FUSE: filehandle_open called despite valid fufh (type=%d)",
- fufh_type);
- /* NOTREACHED */
- }
- /*
- * Note that this means we are effectively FILTERING OUT open() flags.
- */
- oflags = fuse_filehandle_xlate_to_oflags(fufh_type);
+ fufh_type = fflags_2_fufh_type(a_mode);
+ oflags = fufh_type_2_fflags(fufh_type);
if (vnode_isdir(vp)) {
op = FUSE_OPENDIR;
- if (fufh_type != FUFH_RDONLY) {
- SDT_PROBE2(fuse, , file, trace, 1,
- "non-rdonly fh requested for a directory?");
- printf("FUSE:non-rdonly fh requested for a directory?\n");
- fufh_type = FUFH_RDONLY;
- }
+ /* vn_open_vnode already rejects FWRITE on directories */
+ MPASS(fufh_type == FUFH_RDONLY || fufh_type == FUFH_EXEC);
}
fdisp_init(&fdi, sizeof(*foi));
fdisp_make_vp(&fdi, op, vp, td, cred);
@@ -133,7 +148,7 @@
foi->flags = oflags;
if ((err = fdisp_wait_answ(&fdi))) {
- SDT_PROBE2(fuse, , file, trace, 1,
+ SDT_PROBE2(fusefs, , file, trace, 1,
"OUCH ... daemon didn't give fh");
if (err == ENOENT) {
fuse_internal_vnode_disappear(vp);
@@ -142,42 +157,24 @@
}
foo = fdi.answ;
- fuse_filehandle_init(vp, fufh_type, fufhp, foo->fh);
+ fuse_filehandle_init(vp, fufh_type, fufhp, td, cred, foo);
+ fuse_vnode_open(vp, foo->open_flags, td);
- /*
- * For WRONLY opens, force DIRECT_IO. This is necessary
- * since writing a partial block through the buffer cache
- * will result in a read of the block and that read won't
- * be allowed by the WRONLY open.
- */
- if (fufh_type == FUFH_WRONLY)
- fuse_vnode_open(vp, foo->open_flags | FOPEN_DIRECT_IO, td);
- else
- fuse_vnode_open(vp, foo->open_flags, td);
-
out:
fdisp_destroy(&fdi);
return err;
}
int
-fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type,
+fuse_filehandle_close(struct vnode *vp, struct fuse_filehandle *fufh,
struct thread *td, struct ucred *cred)
{
struct fuse_dispatcher fdi;
struct fuse_release_in *fri;
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_filehandle *fufh = NULL;
int err = 0;
int op = FUSE_RELEASE;
- fufh = &(fvdat->fufh[fufh_type]);
- if (!FUFH_IS_VALID(fufh)) {
- panic("FUSE: filehandle_put called on invalid fufh (type=%d)",
- fufh_type);
- /* NOTREACHED */
- }
if (fuse_isdeadfs(vp)) {
goto out;
}
@@ -187,96 +184,194 @@
fdisp_make_vp(&fdi, op, vp, td, cred);
fri = fdi.indata;
fri->fh = fufh->fh_id;
- fri->flags = fuse_filehandle_xlate_to_oflags(fufh_type);
+ fri->flags = fufh_type_2_fflags(fufh->fufh_type);
+ /*
+ * If the file has a POSIX lock then we're supposed to set lock_owner.
+ * If not, then lock_owner is undefined. So we may as well always set
+ * it.
+ */
+ fri->lock_owner = td->td_proc->p_pid;
err = fdisp_wait_answ(&fdi);
fdisp_destroy(&fdi);
out:
- atomic_subtract_acq_int(&fuse_fh_count, 1);
- fufh->fh_id = (uint64_t)-1;
- fufh->fh_type = FUFH_INVALID;
+ counter_u64_add(fuse_fh_count, -1);
+ LIST_REMOVE(fufh, next);
+ free(fufh, M_FUSE_FILEHANDLE);
return err;
}
-int
-fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type)
-{
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_filehandle *fufh;
-
- fufh = &(fvdat->fufh[fufh_type]);
- return FUFH_IS_VALID(fufh);
-}
-
/*
* Check for a valid file handle, first the type requested, but if that
* isn't valid, try for FUFH_RDWR.
- * Return the FUFH type that is valid or FUFH_INVALID if there are none.
- * This is a variant of fuse_filehandle_vaild() analogous to
- * fuse_filehandle_getrw().
+ * Return true if there is any file handle with the correct credentials and
+ * a fufh type that includes the provided one.
+ * A pid of 0 means "don't care"
*/
-fufh_type_t
-fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type)
+bool
+fuse_filehandle_validrw(struct vnode *vp, int mode,
+ struct ucred *cred, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_filehandle *fufh;
+ fufh_type_t fufh_type = fflags_2_fufh_type(mode);
- fufh = &fvdat->fufh[fufh_type];
- if (FUFH_IS_VALID(fufh) != 0)
- return (fufh_type);
- fufh = &fvdat->fufh[FUFH_RDWR];
- if (FUFH_IS_VALID(fufh) != 0)
- return (FUFH_RDWR);
- return (FUFH_INVALID);
+ /*
+ * Unlike fuse_filehandle_get, we want to search for a filehandle with
+ * the exact cred, and no fallback
+ */
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->fufh_type == fufh_type &&
+ fufh->uid == cred->cr_uid &&
+ fufh->gid == cred->cr_rgid &&
+ (pid == 0 || fufh->pid == pid))
+ return true;
+ }
+
+ if (fufh_type == FUFH_EXEC)
+ return false;
+
+ /* Fallback: find a RDWR list entry with the right cred */
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->fufh_type == FUFH_RDWR &&
+ fufh->uid == cred->cr_uid &&
+ fufh->gid == cred->cr_rgid &&
+ (pid == 0 || fufh->pid == pid))
+ return true;
+ }
+
+ return false;
}
int
-fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp)
+fuse_filehandle_get(struct vnode *vp, int fflag,
+ struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_filehandle *fufh;
+ fufh_type_t fufh_type;
- fufh = &(fvdat->fufh[fufh_type]);
- if (!FUFH_IS_VALID(fufh))
+ fufh_type = fflags_2_fufh_type(fflag);
+ /* cred can be NULL for in-kernel clients */
+ if (cred == NULL)
+ goto fallback;
+
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->fufh_type == fufh_type &&
+ fufh->uid == cred->cr_uid &&
+ fufh->gid == cred->cr_rgid &&
+ (pid == 0 || fufh->pid == pid))
+ goto found;
+ }
+
+fallback:
+ /* Fallback: find a list entry with the right flags */
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->fufh_type == fufh_type)
+ break;
+ }
+
+ if (fufh == NULL)
return EBADF;
+
+found:
if (fufhp != NULL)
*fufhp = fufh;
return 0;
}
+/* Get a file handle with any kind of flags */
int
-fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp)
+fuse_filehandle_get_anyflags(struct vnode *vp,
+ struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_filehandle *fufh;
- fufh = &(fvdat->fufh[fufh_type]);
- if (!FUFH_IS_VALID(fufh)) {
- fufh_type = FUFH_RDWR;
+ if (cred == NULL)
+ goto fallback;
+
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->uid == cred->cr_uid &&
+ fufh->gid == cred->cr_rgid &&
+ (pid == 0 || fufh->pid == pid))
+ goto found;
}
- return fuse_filehandle_get(vp, fufh_type, fufhp);
+
+fallback:
+ /* Fallback: find any list entry */
+ fufh = LIST_FIRST(&fvdat->handles);
+
+ if (fufh == NULL)
+ return EBADF;
+
+found:
+ if (fufhp != NULL)
+ *fufhp = fufh;
+ return 0;
}
+int
+fuse_filehandle_getrw(struct vnode *vp, int fflag,
+ struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid)
+{
+ int err;
+
+ err = fuse_filehandle_get(vp, fflag, fufhp, cred, pid);
+ if (err)
+ err = fuse_filehandle_get(vp, FREAD | FWRITE, fufhp, cred, pid);
+ return err;
+}
+
void
fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp, uint64_t fh_id)
+ struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred,
+ struct fuse_open_out *foo)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_filehandle *fufh;
- fufh = &(fvdat->fufh[fufh_type]);
- MPASS(!FUFH_IS_VALID(fufh));
- fufh->fh_id = fh_id;
- fufh->fh_type = fufh_type;
+ fufh = malloc(sizeof(struct fuse_filehandle), M_FUSE_FILEHANDLE,
+ M_WAITOK);
+ MPASS(fufh != NULL);
+ fufh->fh_id = foo->fh;
+ fufh->fufh_type = fufh_type;
+ fufh->gid = cred->cr_rgid;
+ fufh->uid = cred->cr_uid;
+ fufh->pid = td->td_proc->p_pid;
+ fufh->fuse_open_flags = foo->open_flags;
if (!FUFH_IS_VALID(fufh)) {
panic("FUSE: init: invalid filehandle id (type=%d)", fufh_type);
}
+ LIST_INSERT_HEAD(&fvdat->handles, fufh, next);
if (fufhp != NULL)
*fufhp = fufh;
- atomic_add_acq_int(&fuse_fh_count, 1);
+ counter_u64_add(fuse_fh_count, 1);
+
+ if (foo->open_flags & FOPEN_DIRECT_IO) {
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ VTOFUD(vp)->flag |= FN_DIRECTIO;
+ fuse_io_invalbuf(vp, td);
+ } else {
+ if ((foo->open_flags & FOPEN_KEEP_CACHE) == 0)
+ fuse_io_invalbuf(vp, td);
+ VTOFUD(vp)->flag &= ~FN_DIRECTIO;
+ }
+
+}
+
+void
+fuse_file_init(void)
+{
+ fuse_fh_count = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_fh_count);
+}
+
+void
+fuse_file_destroy(void)
+{
+ counter_u64_free(fuse_fh_count);
}
Index: sys/fs/fuse/fuse_internal.h
===================================================================
--- sys/fs/fuse/fuse_internal.h
+++ sys/fs/fuse/fuse_internal.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -61,6 +66,7 @@
#define _FUSE_INTERNAL_H_
#include <sys/types.h>
+#include <sys/counter.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/vnode.h>
@@ -68,6 +74,9 @@
#include "fuse_ipc.h"
#include "fuse_node.h"
+extern counter_u64_t fuse_lookup_cache_hits;
+extern counter_u64_t fuse_lookup_cache_misses;
+
static inline bool
vfs_isrdonly(struct mount *mp)
{
@@ -80,12 +89,6 @@
return (vp->v_mount);
}
-static inline bool
-vnode_mountedhere(struct vnode *vp)
-{
- return (vp->v_mountedhere != NULL);
-}
-
static inline enum vtype
vnode_vtype(struct vnode *vp)
{
@@ -134,12 +137,6 @@
uio->uio_offset = offset;
}
-static inline void
-uio_setresid(struct uio *uio, ssize_t resid)
-{
- uio->uio_resid = resid;
-}
-
/* miscellaneous */
static inline bool
@@ -156,25 +153,57 @@
return (vp->v_mount->mnt_stat.f_iosize);
}
-/* access */
+/*
+ * Make a cacheable timeout in bintime format value based on a fuse_attr_out
+ * response
+ */
+static inline void
+fuse_validity_2_bintime(uint64_t attr_valid, uint32_t attr_valid_nsec,
+ struct bintime *timeout)
+{
+ struct timespec now, duration, timeout_ts;
-#define FVP_ACCESS_NOOP 0x01
+ getnanouptime(&now);
+ /* "+ 2" is the bound of attr_valid_nsec + now.tv_nsec */
+ /* Why oh why isn't there a TIME_MAX defined? */
+ if (attr_valid >= INT_MAX || attr_valid + now.tv_sec + 2 >= INT_MAX) {
+ timeout->sec = INT_MAX;
+ } else {
+ duration.tv_sec = attr_valid;
+ duration.tv_nsec = attr_valid_nsec;
+ timespecadd(&duration, &now, &timeout_ts);
+ timespec2bintime(&timeout_ts, timeout);
+ }
+}
-#define FACCESS_VA_VALID 0x01
-#define FACCESS_DO_ACCESS 0x02
-#define FACCESS_STICKY 0x04
-#define FACCESS_CHOWN 0x08
-#define FACCESS_NOCHECKSPY 0x10
-#define FACCESS_SETGID 0x12
+/*
+ * Make a cacheable timeout value in timespec format based on the fuse_entry_out
+ * response
+ */
+static inline void
+fuse_validity_2_timespec(const struct fuse_entry_out *feo,
+ struct timespec *timeout)
+{
+ struct timespec duration, now;
-#define FACCESS_XQUERIES (FACCESS_STICKY | FACCESS_CHOWN | FACCESS_SETGID)
+ getnanouptime(&now);
+ /* "+ 2" is the bound of entry_valid_nsec + now.tv_nsec */
+ if (feo->entry_valid >= INT_MAX ||
+ feo->entry_valid + now.tv_sec + 2 >= INT_MAX) {
+ timeout->tv_sec = INT_MAX;
+ } else {
+ duration.tv_sec = feo->entry_valid;
+ duration.tv_nsec = feo->entry_valid_nsec;
+ timespecadd(&duration, &now, timeout);
+ }
+}
-struct fuse_access_param {
- uid_t xuid;
- gid_t xgid;
- uint32_t facc_flags;
-};
+/* VFS ops */
+int
+fuse_internal_get_cached_vnode(struct mount*, ino_t, int, struct vnode**);
+
+/* access */
static inline int
fuse_match_cred(struct ucred *basecred, struct ucred *usercred)
{
@@ -189,8 +218,8 @@
return (EPERM);
}
-int fuse_internal_access(struct vnode *vp, mode_t mode,
- struct fuse_access_param *facp, struct thread *td, struct ucred *cred);
+int fuse_internal_access(struct vnode *vp, accmode_t mode,
+ struct thread *td, struct ucred *cred);
/* attributes */
void fuse_internal_cache_attrs(struct vnode *vp, struct fuse_attr *attr,
@@ -198,21 +227,35 @@
/* fsync */
-int fuse_internal_fsync(struct vnode *vp, struct thread *td,
- struct ucred *cred, struct fuse_filehandle *fufh);
+int fuse_internal_fsync(struct vnode *vp, struct thread *td, int waitfor,
+ bool datasync);
int fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio);
-/* readdir */
+/* getattr */
+int fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap,
+ struct ucred *cred, struct thread *td);
+int fuse_internal_getattr(struct vnode *vp, struct vattr *vap,
+ struct ucred *cred, struct thread *td);
+/* asynchronous invalidation */
+int fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio);
+int fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio);
+
+/* mknod */
+int fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
+ struct componentname *cnp, struct vattr *vap);
+
+/* readdir */
struct pseudo_dirent {
uint32_t d_namlen;
};
+int fuse_internal_readdir(struct vnode *vp, struct uio *uio, off_t startoff,
+ struct fuse_filehandle *fufh, struct fuse_iov *cookediov, int *ncookies,
+ u_long *cookies);
+int fuse_internal_readdir_processdata(struct uio *uio, off_t startoff,
+ int *fnd_start, size_t reqsize, void *buf, size_t bufsize,
+ struct fuse_iov *cookediov, int *ncookies, u_long **cookiesp);
-int fuse_internal_readdir(struct vnode *vp, struct uio *uio,
- struct fuse_filehandle *fufh, struct fuse_iov *cookediov);
-int fuse_internal_readdir_processdata(struct uio *uio, size_t reqsize,
- void *buf, size_t bufsize, void *param);
-
/* remove */
int fuse_internal_remove(struct vnode *dvp, struct vnode *vp,
@@ -227,6 +270,10 @@
void fuse_internal_vnode_disappear(struct vnode *vp);
+/* setattr */
+int fuse_internal_setattr(struct vnode *vp, struct vattr *va,
+ struct thread *td, struct ucred *cred);
+
/* strategy */
/* entity creation */
@@ -270,5 +317,9 @@
int fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio);
void fuse_internal_send_init(struct fuse_data *data, struct thread *td);
+
+/* module load/unload */
+void fuse_internal_init(void);
+void fuse_internal_destroy(void);
#endif /* _FUSE_INTERNAL_H_ */
Index: sys/fs/fuse/fuse_internal.c
===================================================================
--- sys/fs/fuse/fuse_internal.c
+++ sys/fs/fuse/fuse_internal.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -59,8 +64,9 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/module.h>
#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/module.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/conf.h>
@@ -89,35 +95,78 @@
#include "fuse.h"
#include "fuse_file.h"
#include "fuse_internal.h"
+#include "fuse_io.h"
#include "fuse_ipc.h"
#include "fuse_node.h"
#include "fuse_file.h"
-#include "fuse_param.h"
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , internal, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , internal, trace, "int", "char*");
#ifdef ZERO_PAD_INCOMPLETE_BUFS
static int isbzero(void *buf, size_t len);
#endif
-/* access */
+counter_u64_t fuse_lookup_cache_hits;
+counter_u64_t fuse_lookup_cache_misses;
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_hits, CTLFLAG_RD,
+ &fuse_lookup_cache_hits, "number of positive cache hits in lookup");
+
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_misses, CTLFLAG_RD,
+ &fuse_lookup_cache_misses, "number of cache misses in lookup");
+
int
+fuse_internal_get_cached_vnode(struct mount* mp, ino_t ino, int flags,
+ struct vnode **vpp)
+{
+ struct bintime now;
+ struct thread *td = curthread;
+ uint64_t nodeid = ino;
+ int error;
+
+ *vpp = NULL;
+
+ error = vfs_hash_get(mp, fuse_vnode_hash(nodeid), flags, td, vpp,
+ fuse_vnode_cmp, &nodeid);
+ if (error)
+ return error;
+ /*
+ * Check the entry cache timeout. We have to do this within fusefs
+ * instead of by using cache_enter_time/cache_lookup because those
+ * routines are only intended to work with pathnames, not inodes
+ */
+ if (*vpp != NULL) {
+ getbinuptime(&now);
+ if (bintime_cmp(&(VTOFUD(*vpp)->entry_cache_timeout), &now, >)){
+ counter_u64_add(fuse_lookup_cache_hits, 1);
+ return 0;
+ } else {
+ /* Entry cache timeout */
+ counter_u64_add(fuse_lookup_cache_misses, 1);
+ cache_purge(*vpp);
+ vput(*vpp);
+ *vpp = NULL;
+ }
+ }
+ return 0;
+}
+
+/* Synchronously send a FUSE_ACCESS operation */
+int
fuse_internal_access(struct vnode *vp,
- mode_t mode,
- struct fuse_access_param *facp,
+ accmode_t mode,
struct thread *td,
struct ucred *cred)
{
int err = 0;
- uint32_t mask = 0;
+ uint32_t mask = F_OK;
int dataflags;
int vtype;
struct mount *mp;
@@ -125,77 +174,57 @@
struct fuse_access_in *fai;
struct fuse_data *data;
- /* NOT YET DONE */
- /*
- * If this vnop gives you trouble, just return 0 here for a lazy
- * kludge.
- */
- /* return 0;*/
-
mp = vnode_mount(vp);
vtype = vnode_vtype(vp);
data = fuse_get_mpdata(mp);
dataflags = data->dataflags;
- if ((mode & VWRITE) && vfs_isrdonly(mp)) {
- return EACCES;
- }
- /* Unless explicitly permitted, deny everyone except the fs owner. */
- if (vnode_isvroot(vp) && !(facp->facc_flags & FACCESS_NOCHECKSPY)) {
- if (!(dataflags & FSESS_DAEMON_CAN_SPY)) {
- int denied = fuse_match_cred(data->daemoncred,
- cred);
+ if (mode == 0)
+ return 0;
- if (denied) {
- return EPERM;
- }
+ if (mode & VMODIFY_PERMS && vfs_isrdonly(mp)) {
+ switch (vp->v_type) {
+ case VDIR:
+ /* FALLTHROUGH */
+ case VLNK:
+ /* FALLTHROUGH */
+ case VREG:
+ return EROFS;
+ default:
+ break;
}
- facp->facc_flags |= FACCESS_NOCHECKSPY;
}
- if (!(facp->facc_flags & FACCESS_DO_ACCESS)) {
- return 0;
+
+ /* Unless explicitly permitted, deny everyone except the fs owner. */
+ if (!(dataflags & FSESS_DAEMON_CAN_SPY)) {
+ if (fuse_match_cred(data->daemoncred, cred))
+ return EPERM;
}
- if (((vtype == VREG) && (mode & VEXEC))) {
-#ifdef NEED_MOUNT_ARGUMENT_FOR_THIS
- /* Let the kernel handle this through open / close heuristics.*/
- return ENOTSUP;
-#else
- /* Let the kernel handle this. */
- return 0;
-#endif
- }
- if (!fsess_isimpl(mp, FUSE_ACCESS)) {
- /* Let the kernel handle this. */
- return 0;
- }
+
if (dataflags & FSESS_DEFAULT_PERMISSIONS) {
- /* Let the kernel handle this. */
- return 0;
+ struct vattr va;
+
+ fuse_internal_getattr(vp, &va, cred, td);
+ return vaccess(vp->v_type, va.va_mode, va.va_uid,
+ va.va_gid, mode, cred, NULL);
}
- if ((mode & VADMIN) != 0) {
- err = priv_check_cred(cred, PRIV_VFS_ADMIN);
- if (err) {
- return err;
- }
- }
- if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0) {
+
+ if (!fsess_isimpl(mp, FUSE_ACCESS))
+ return 0;
+
+ if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0)
mask |= W_OK;
- }
- if ((mode & VREAD) != 0) {
+ if ((mode & VREAD) != 0)
mask |= R_OK;
- }
- if ((mode & VEXEC) != 0) {
+ if ((mode & VEXEC) != 0)
mask |= X_OK;
- }
- bzero(&fdi, sizeof(fdi));
fdisp_init(&fdi, sizeof(*fai));
fdisp_make_vp(&fdi, FUSE_ACCESS, vp, td, cred);
fai = fdi.indata;
- fai->mask = F_OK;
- fai->mask |= mask;
+ fai->mask = mask;
err = fdisp_wait_answ(&fdi);
fdisp_destroy(&fdi);
@@ -208,9 +237,9 @@
}
/*
- * Cache FUSE attributes from feo, in attr cache associated with vnode 'vp'.
- * Optionally, if argument 'vap' is not NULL, store a copy of the converted
- * attributes there as well.
+ * Cache FUSE attributes from attr, in attribute cache associated with vnode
+ * 'vp'. Optionally, if argument 'vap' is not NULL, store a copy of the
+ * converted attributes there as well.
*
* If the nominal attribute cache TTL is zero, do not cache on the 'vp' (but do
* return the result to the caller).
@@ -221,49 +250,57 @@
{
struct mount *mp;
struct fuse_vnode_data *fvdat;
+ struct fuse_data *data;
struct vattr *vp_cache_at;
mp = vnode_mount(vp);
fvdat = VTOFUD(vp);
+ data = fuse_get_mpdata(mp);
- /* Honor explicit do-not-cache requests from user filesystems. */
- if (attr_valid == 0 && attr_valid_nsec == 0)
- fvdat->valid_attr_cache = false;
- else
- fvdat->valid_attr_cache = true;
+ ASSERT_VOP_ELOCKED(vp, "fuse_internal_cache_attrs");
- vp_cache_at = VTOVA(vp);
+ fuse_validity_2_bintime(attr_valid, attr_valid_nsec,
+ &fvdat->attr_cache_timeout);
- if (vap == NULL && vp_cache_at == NULL)
+ /* Fix our buffers if the filesize changed without us knowing */
+ if (vnode_isreg(vp) && attr->size != fvdat->cached_attrs.va_size) {
+ (void)fuse_vnode_setsize(vp, attr->size);
+ fvdat->cached_attrs.va_size = attr->size;
+ }
+
+ if (attr_valid > 0 || attr_valid_nsec > 0)
+ vp_cache_at = &(fvdat->cached_attrs);
+ else if (vap != NULL)
+ vp_cache_at = vap;
+ else
return;
- if (vap == NULL)
- vap = vp_cache_at;
-
- vattr_null(vap);
-
- vap->va_fsid = mp->mnt_stat.f_fsid.val[0];
- vap->va_fileid = attr->ino;
- vap->va_mode = attr->mode & ~S_IFMT;
- vap->va_nlink = attr->nlink;
- vap->va_uid = attr->uid;
- vap->va_gid = attr->gid;
- vap->va_rdev = attr->rdev;
- vap->va_size = attr->size;
+ vattr_null(vp_cache_at);
+ vp_cache_at->va_fsid = mp->mnt_stat.f_fsid.val[0];
+ vp_cache_at->va_fileid = attr->ino;
+ vp_cache_at->va_mode = attr->mode & ~S_IFMT;
+ vp_cache_at->va_nlink = attr->nlink;
+ vp_cache_at->va_uid = attr->uid;
+ vp_cache_at->va_gid = attr->gid;
+ vp_cache_at->va_rdev = attr->rdev;
+ vp_cache_at->va_size = attr->size;
/* XXX on i386, seconds are truncated to 32 bits */
- vap->va_atime.tv_sec = attr->atime;
- vap->va_atime.tv_nsec = attr->atimensec;
- vap->va_mtime.tv_sec = attr->mtime;
- vap->va_mtime.tv_nsec = attr->mtimensec;
- vap->va_ctime.tv_sec = attr->ctime;
- vap->va_ctime.tv_nsec = attr->ctimensec;
- vap->va_blocksize = PAGE_SIZE;
- vap->va_type = IFTOVT(attr->mode);
- vap->va_bytes = attr->blocks * S_BLKSIZE;
- vap->va_flags = 0;
+ vp_cache_at->va_atime.tv_sec = attr->atime;
+ vp_cache_at->va_atime.tv_nsec = attr->atimensec;
+ vp_cache_at->va_mtime.tv_sec = attr->mtime;
+ vp_cache_at->va_mtime.tv_nsec = attr->mtimensec;
+ vp_cache_at->va_ctime.tv_sec = attr->ctime;
+ vp_cache_at->va_ctime.tv_nsec = attr->ctimensec;
+ if (fuse_libabi_geq(data, 7, 9) && attr->blksize > 0)
+ vp_cache_at->va_blocksize = attr->blksize;
+ else
+ vp_cache_at->va_blocksize = PAGE_SIZE;
+ vp_cache_at->va_type = IFTOVT(attr->mode);
+ vp_cache_at->va_bytes = attr->blocks * S_BLKSIZE;
+ vp_cache_at->va_flags = 0;
- if (vap != vp_cache_at && vp_cache_at != NULL)
- memcpy(vp_cache_at, vap, sizeof(*vap));
+ if (vap != vp_cache_at && vap != NULL)
+ memcpy(vap, vp_cache_at, sizeof(*vap));
}
@@ -281,47 +318,195 @@
int
fuse_internal_fsync(struct vnode *vp,
struct thread *td,
- struct ucred *cred,
- struct fuse_filehandle *fufh)
+ int waitfor,
+ bool datasync)
{
- int op = FUSE_FSYNC;
- struct fuse_fsync_in *ffsi;
+ struct fuse_fsync_in *ffsi = NULL;
struct fuse_dispatcher fdi;
+ struct fuse_filehandle *fufh;
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct mount *mp = vnode_mount(vp);
+ int op = FUSE_FSYNC;
+ int err = 0;
- if (vnode_isdir(vp)) {
- op = FUSE_FSYNCDIR;
+ if (!fsess_isimpl(vnode_mount(vp),
+ (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) {
+ return 0;
}
- fdisp_init(&fdi, sizeof(*ffsi));
- fdisp_make_vp(&fdi, op, vp, td, cred);
- ffsi = fdi.indata;
- ffsi->fh = fufh->fh_id;
+ if (vnode_isdir(vp))
+ op = FUSE_FSYNCDIR;
- ffsi->fsync_flags = 1; /* datasync */
+ if (!fsess_isimpl(mp, op))
+ return 0;
- fuse_insert_callback(fdi.tick, fuse_internal_fsync_callback);
- fuse_insert_message(fdi.tick);
+ fdisp_init(&fdi, sizeof(*ffsi));
+ /*
+ * fsync every open file handle for this file, because we can't be sure
+ * which file handle the caller is really referring to.
+ */
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (ffsi == NULL)
+ fdisp_make_vp(&fdi, op, vp, td, NULL);
+ else
+ fdisp_refresh_vp(&fdi, op, vp, td, NULL);
+ ffsi = fdi.indata;
+ ffsi->fh = fufh->fh_id;
+ ffsi->fsync_flags = 0;
+ if (datasync)
+ ffsi->fsync_flags = 1;
+
+ if (waitfor == MNT_WAIT) {
+ err = fdisp_wait_answ(&fdi);
+ } else {
+ fuse_insert_callback(fdi.tick,
+ fuse_internal_fsync_callback);
+ fuse_insert_message(fdi.tick, false);
+ }
+ if (err == ENOSYS) {
+ /* ENOSYS means "success, and don't call again" */
+ fsess_set_notimpl(mp, op);
+ err = 0;
+ break;
+ }
+ }
fdisp_destroy(&fdi);
- return 0;
+ return err;
+}
+/* Asynchronous invalidation */
+SDT_PROBE_DEFINE2(fusefs, , internal, invalidate_cache_hit,
+ "struct vnode*", "struct vnode*");
+int
+fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio)
+{
+ struct fuse_notify_inval_entry_out fnieo;
+ struct componentname cn;
+ struct vnode *dvp, *vp;
+ char name[PATH_MAX];
+ int err;
+
+ if ((err = uiomove(&fnieo, sizeof(fnieo), uio)) != 0)
+ return (err);
+
+ if ((err = uiomove(name, fnieo.namelen, uio)) != 0)
+ return (err);
+ name[fnieo.namelen] = '\0';
+ /* fusefs does not cache "." or ".." entries */
+ if (strncmp(name, ".", sizeof(".")) == 0 ||
+ strncmp(name, "..", sizeof("..")) == 0)
+ return (0);
+
+ if (fnieo.parent == FUSE_ROOT_ID)
+ err = VFS_ROOT(mp, LK_SHARED, &dvp);
+ else
+ err = fuse_internal_get_cached_vnode( mp, fnieo.parent,
+ LK_SHARED, &dvp);
+ /*
+ * If dvp is not in the cache, then it must've been reclaimed. And
+ * since fuse_vnop_reclaim does a cache_purge, name's entry must've
+ * been invalidated already. So we can safely return if dvp == NULL
+ */
+ if (err != 0 || dvp == NULL)
+ return (err);
+ /*
+ * XXX we can't check dvp's generation because the FUSE invalidate
+ * entry message doesn't include it. Worst case is that we invalidate
+ * an entry that didn't need to be invalidated.
+ */
+
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = 0; /* !MAKEENTRY means free cached entry */
+ cn.cn_thread = curthread;
+ cn.cn_cred = curthread->td_ucred;
+ cn.cn_lkflags = LK_SHARED;
+ cn.cn_pnbuf = NULL;
+ cn.cn_nameptr = name;
+ cn.cn_namelen = fnieo.namelen;
+ err = cache_lookup(dvp, &vp, &cn, NULL, NULL);
+ MPASS(err == 0);
+ fuse_vnode_clear_attr_cache(dvp);
+ vput(dvp);
+ return (0);
}
+int
+fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio)
+{
+ struct fuse_notify_inval_inode_out fniio;
+ struct vnode *vp;
+ int err;
+
+ if ((err = uiomove(&fniio, sizeof(fniio), uio)) != 0)
+ return (err);
+
+ if (fniio.ino == FUSE_ROOT_ID)
+ err = VFS_ROOT(mp, LK_EXCLUSIVE, &vp);
+ else
+ err = fuse_internal_get_cached_vnode(mp, fniio.ino, LK_SHARED,
+ &vp);
+ if (err != 0 || vp == NULL)
+ return (err);
+ /*
+ * XXX we can't check vp's generation because the FUSE invalidate
+ * inode message doesn't include it. Worst case is that we invalidate
+ * an inode that didn't need to be invalidated.
+ */
+
+ /*
+ * Flush and invalidate buffers if off >= 0. Technically we only need
+ * to flush and invalidate the range of offsets [off, off + len), but
+ * for simplicity's sake we do everything.
+ */
+ if (fniio.off >= 0)
+ fuse_io_invalbuf(vp, curthread);
+ fuse_vnode_clear_attr_cache(vp);
+ vput(vp);
+ return (0);
+}
+
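
The two handlers above consume the FUSE_NOTIFY_INVAL_ENTRY and FUSE_NOTIFY_INVAL_INODE
messages that a daemon can push down /dev/fuse on its own initiative. A hedged
daemon-side sketch of triggering them, assuming the libfuse 3 low-level API (libfuse 2
takes a struct fuse_chan * instead of a session) and a hypothetical helper name:

    #define FUSE_USE_VERSION 31
    #include <string.h>
    #include <fuse_lowlevel.h>

    /*
     * Hypothetical helper: after the backing store changes behind the
     * kernel's back, drop the cached name entry and the cached data and
     * attributes for the affected inode.
     */
    static void
    invalidate_after_external_change(struct fuse_session *se,
        fuse_ino_t parent, const char *name, fuse_ino_t ino)
    {
        /* Handled by fuse_internal_invalidate_entry() above. */
        fuse_lowlevel_notify_inval_entry(se, parent, name, strlen(name));

        /*
         * Handled by fuse_internal_invalidate_inode() above.  Offset 0 with
         * length 0 invalidates all cached data; a negative offset would
         * invalidate attributes only (hence the "off >= 0" check there).
         */
        fuse_lowlevel_notify_inval_inode(se, ino, 0, 0);
    }
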
+/* mknod */
+int
+fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
+ struct componentname *cnp, struct vattr *vap)
+{
+ struct fuse_data *data;
+ struct fuse_mknod_in fmni;
+ size_t insize;
+
+ data = fuse_get_mpdata(dvp->v_mount);
+
+ fmni.mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ fmni.rdev = vap->va_rdev;
+ if (fuse_libabi_geq(data, 7, 12)) {
+ insize = sizeof(fmni);
+ fmni.umask = curthread->td_proc->p_fd->fd_cmask;
+ } else {
+ insize = FUSE_COMPAT_MKNOD_IN_SIZE;
+ }
+ return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKNOD, &fmni,
+ insize, vap->va_type));
+}
+
/* readdir */
int
fuse_internal_readdir(struct vnode *vp,
struct uio *uio,
+ off_t startoff,
struct fuse_filehandle *fufh,
- struct fuse_iov *cookediov)
+ struct fuse_iov *cookediov,
+ int *ncookies,
+ u_long *cookies)
{
int err = 0;
struct fuse_dispatcher fdi;
- struct fuse_read_in *fri;
+ struct fuse_read_in *fri = NULL;
+ int fnd_start;
- if (uio_resid(uio) == 0) {
+ if (uio_resid(uio) == 0)
return 0;
- }
fdisp_init(&fdi, 0);
/*
@@ -329,51 +514,70 @@
* I/O).
*/
+ /*
+ * fnd_start is set non-zero once the offset in the directory gets
+ * to the startoff. This is done because directories must be read
+ * from the beginning (offset == 0) when fuse_vnop_readdir() needs
+ * to do an open of the directory.
+ * If it is not set non-zero here, it will be set non-zero in
+ * fuse_internal_readdir_processdata() when uio_offset == startoff.
+ */
+ fnd_start = 0;
+ if (uio->uio_offset == startoff)
+ fnd_start = 1;
while (uio_resid(uio) > 0) {
-
fdi.iosize = sizeof(*fri);
- fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
+ if (fri == NULL)
+ fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
+ else
+ fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
fri = fdi.indata;
fri->fh = fufh->fh_id;
fri->offset = uio_offset(uio);
- fri->size = min(uio_resid(uio), FUSE_DEFAULT_IOSIZE);
- /* mp->max_read */
+ fri->size = MIN(uio->uio_resid,
+ fuse_get_mpdata(vp->v_mount)->max_read);
- if ((err = fdisp_wait_answ(&fdi))) {
+ if ((err = fdisp_wait_answ(&fdi)))
break;
- }
- if ((err = fuse_internal_readdir_processdata(uio, fri->size, fdi.answ,
- fdi.iosize, cookediov))) {
+ if ((err = fuse_internal_readdir_processdata(uio, startoff,
+ &fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov,
+ ncookies, &cookies)))
break;
- }
}
fdisp_destroy(&fdi);
return ((err == -1) ? 0 : err);
}
+/*
+ * Return -1 to indicate that this readdir is finished, 0 if it copied
+ * all of the directory data that was read in (in which case it may be
+ * possible to read more), and a value greater than 0 on failure.
+ */
int
fuse_internal_readdir_processdata(struct uio *uio,
+ off_t startoff,
+ int *fnd_start,
size_t reqsize,
void *buf,
size_t bufsize,
- void *param)
+ struct fuse_iov *cookediov,
+ int *ncookies,
+ u_long **cookiesp)
{
int err = 0;
- int cou = 0;
int bytesavail;
size_t freclen;
struct dirent *de;
struct fuse_dirent *fudge;
- struct fuse_iov *cookediov = param;
+ u_long *cookies;
- if (bufsize < FUSE_NAME_OFFSET) {
+ cookies = *cookiesp;
+ if (bufsize < FUSE_NAME_OFFSET)
return -1;
- }
for (;;) {
-
if (bufsize < FUSE_NAME_OFFSET) {
err = -1;
break;
@@ -381,10 +585,12 @@
fudge = (struct fuse_dirent *)buf;
freclen = FUSE_DIRENT_SIZE(fudge);
- cou++;
-
if (bufsize < freclen) {
- err = ((cou == 1) ? -1 : 0);
+ /*
+ * This indicates a partial directory entry at the
+ * end of the directory data.
+ */
+ err = -1;
break;
}
#ifdef ZERO_PAD_INCOMPLETE_BUFS
@@ -402,30 +608,47 @@
&fudge->namelen);
if (bytesavail > uio_resid(uio)) {
+ /* Out of space for the dir so we are done. */
err = -1;
break;
}
- fiov_refresh(cookediov);
- fiov_adjust(cookediov, bytesavail);
+ /*
+ * Don't start to copy the directory entries out until
+ * the requested offset in the directory is found.
+ */
+ if (*fnd_start != 0) {
+ fiov_adjust(cookediov, bytesavail);
+ bzero(cookediov->base, bytesavail);
- de = (struct dirent *)cookediov->base;
- de->d_fileno = fudge->ino;
- de->d_reclen = bytesavail;
- de->d_type = fudge->type;
- de->d_namlen = fudge->namelen;
- memcpy((char *)cookediov->base + sizeof(struct dirent) -
- MAXNAMLEN - 1,
- (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
- dirent_terminate(de);
+ de = (struct dirent *)cookediov->base;
+ de->d_fileno = fudge->ino;
+ de->d_reclen = bytesavail;
+ de->d_type = fudge->type;
+ de->d_namlen = fudge->namelen;
+ memcpy((char *)cookediov->base + sizeof(struct dirent) -
+ MAXNAMLEN - 1,
+ (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
+ dirent_terminate(de);
- err = uiomove(cookediov->base, cookediov->len, uio);
- if (err) {
- break;
- }
+ err = uiomove(cookediov->base, cookediov->len, uio);
+ if (err)
+ break;
+ if (cookies != NULL) {
+ if (*ncookies == 0) {
+ err = -1;
+ break;
+ }
+ *cookies = fudge->off;
+ cookies++;
+ (*ncookies)--;
+ }
+ } else if (startoff == fudge->off)
+ *fnd_start = 1;
buf = (char *)buf + freclen;
bufsize -= freclen;
uio_setoffset(uio, fudge->off);
}
+ *cookiesp = cookies;
return err;
}
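
The records unpacked by fuse_internal_readdir_processdata() are packed by the daemon,
and fudge->off, which becomes both the resume offset and the VOP_READDIR cookie, is
whatever offset cookie the daemon chose for each entry. A hedged daemon-side sketch,
assuming the libfuse low-level API and a hypothetical directory holding one file:

    #define FUSE_USE_VERSION 31
    #include <string.h>
    #include <sys/stat.h>
    #include <fuse_lowlevel.h>

    /* Hypothetical readdir handler for a directory with a single file. */
    static void
    example_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
        struct fuse_file_info *fi)
    {
        char buf[4096];
        size_t pos = 0;
        struct stat st;

        (void)ino; (void)size; (void)fi;
        memset(&st, 0, sizeof(st));
        st.st_ino = 2;
        st.st_mode = S_IFREG;
        if (off == 0) {
            /*
             * The last argument is the entry's offset cookie: the value
             * the kernel saves in the cookie array and sends back as
             * fri->offset when it resumes the listing.
             */
            pos += fuse_add_direntry(req, buf + pos, sizeof(buf) - pos,
                "hello", &st, 1);
        }
        /* An empty reply tells the kernel it has reached EOF. */
        fuse_reply_buf(req, buf, pos);
    }
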
@@ -439,12 +662,9 @@
enum fuse_opcode op)
{
struct fuse_dispatcher fdi;
- struct fuse_vnode_data *fvdat;
- int err;
+ nlink_t nlink;
+ int err = 0;
- err = 0;
- fvdat = VTOFUD(vp);
-
fdisp_init(&fdi, cnp->cn_namelen + 1);
fdisp_make_vp(&fdi, op, dvp, cnp->cn_thread, cnp->cn_cred);
@@ -453,6 +673,35 @@
err = fdisp_wait_answ(&fdi);
fdisp_destroy(&fdi);
+
+ if (err)
+ return (err);
+
+ /*
+ * Access the cached nlink even if the attr cache has expired. If
+ * it's inaccurate, the worst that will happen is:
+ * 1) We'll recycle the vnode even though the file has another link we
+ * don't know about, costing a bit of cpu time, or
+ * 2) We won't recycle the vnode even though all of its links are gone.
+ * It will linger around until vnlru reclaims it, costing a bit of
+ * temporary memory.
+ */
+ nlink = VTOFUD(vp)->cached_attrs.va_nlink--;
+
+ /*
+ * Purge the parent's attribute cache because the daemon
+ * should've updated its mtime and ctime.
+ */
+ fuse_vnode_clear_attr_cache(dvp);
+
+ /* NB: nlink could be zero if it was never cached */
+ if (nlink <= 1 || vnode_vtype(vp) == VDIR) {
+ fuse_internal_vnode_disappear(vp);
+ } else {
+ cache_purge(vp);
+ fuse_vnode_update(vp, FN_CTIMECHANGE);
+ }
+
return err;
}
@@ -532,6 +781,13 @@
feo->nodeid, 1);
return err;
}
+
+ /*
+ * Purge the parent's attribute cache because the daemon should've
+ * updated its mtime and ctime
+ */
+ fuse_vnode_clear_attr_cache(dvp);
+
fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
feo->attr_valid_nsec, NULL);
@@ -593,10 +849,79 @@
ffi = fdi.indata;
ffi->nlookup = nlookup;
- fuse_insert_message(fdi.tick);
+ fuse_insert_message(fdi.tick, false);
fdisp_destroy(&fdi);
}
+/* Fetch the vnode's attributes from the daemon */
+int
+fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap,
+ struct ucred *cred, struct thread *td)
+{
+ struct fuse_dispatcher fdi;
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_getattr_in *fgai;
+ struct fuse_attr_out *fao;
+ off_t old_filesize = fvdat->cached_attrs.va_size;
+ struct timespec old_ctime = fvdat->cached_attrs.va_ctime;
+ struct timespec old_mtime = fvdat->cached_attrs.va_mtime;
+ enum vtype vtyp;
+ int err;
+
+ fdisp_init(&fdi, 0);
+ fdisp_make_vp(&fdi, FUSE_GETATTR, vp, td, cred);
+ fgai = fdi.indata;
+ /*
+ * We could look up a file handle and set it in fgai->fh, but that
+ * involves extra runtime work and I'm unaware of any file systems that
+ * care.
+ */
+ fgai->getattr_flags = 0;
+ if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) {
+ if (err == ENOENT)
+ fuse_internal_vnode_disappear(vp);
+ goto out;
+ }
+
+ fao = (struct fuse_attr_out *)fdi.answ;
+ vtyp = IFTOVT(fao->attr.mode);
+ if (fvdat->flag & FN_SIZECHANGE)
+ fao->attr.size = old_filesize;
+ if (fvdat->flag & FN_CTIMECHANGE) {
+ fao->attr.ctime = old_ctime.tv_sec;
+ fao->attr.ctimensec = old_ctime.tv_nsec;
+ }
+ if (fvdat->flag & FN_MTIMECHANGE) {
+ fao->attr.mtime = old_mtime.tv_sec;
+ fao->attr.mtimensec = old_mtime.tv_nsec;
+ }
+ fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
+ fao->attr_valid_nsec, vap);
+ if (vtyp != vnode_vtype(vp)) {
+ fuse_internal_vnode_disappear(vp);
+ err = ENOENT;
+ }
+
+out:
+ fdisp_destroy(&fdi);
+ return err;
+}
+
+/* Read a vnode's attributes from cache or fetch them from the fuse daemon */
+int
+fuse_internal_getattr(struct vnode *vp, struct vattr *vap, struct ucred *cred,
+ struct thread *td)
+{
+ struct vattr *attrs;
+
+ if ((attrs = VTOVA(vp)) != NULL) {
+ *vap = *attrs; /* struct copy */
+ return 0;
+ }
+
+ return fuse_internal_do_getattr(vp, vap, cred, td);
+}
+
void
fuse_internal_vnode_disappear(struct vnode *vp)
{
@@ -604,7 +929,6 @@
ASSERT_VOP_ELOCKED(vp, "fuse_internal_vnode_disappear");
fvdat->flag |= FN_REVOKED;
- fvdat->valid_attr_cache = false;
cache_purge(vp);
}
@@ -625,27 +949,69 @@
}
fiio = fticket_resp(tick)->base;
- /* XXX: Do we want to check anything further besides this? */
- if (fiio->major < 7) {
- SDT_PROBE2(fuse, , internal, trace, 1,
+ data->fuse_libabi_major = fiio->major;
+ data->fuse_libabi_minor = fiio->minor;
+ if (!fuse_libabi_geq(data, 7, 4)) {
+ /*
+ * With a little work we could support servers as old as 7.1.
+ * But there would be little payoff.
+ */
+ SDT_PROBE2(fusefs, , internal, trace, 1,
"userpace version too low");
err = EPROTONOSUPPORT;
goto out;
}
- data->fuse_libabi_major = fiio->major;
- data->fuse_libabi_minor = fiio->minor;
if (fuse_libabi_geq(data, 7, 5)) {
- if (fticket_resp(tick)->len == sizeof(struct fuse_init_out)) {
+ if (fticket_resp(tick)->len == sizeof(struct fuse_init_out) ||
+ fticket_resp(tick)->len == FUSE_COMPAT_22_INIT_OUT_SIZE) {
data->max_write = fiio->max_write;
+ if (fiio->flags & FUSE_ASYNC_READ)
+ data->dataflags |= FSESS_ASYNC_READ;
+ if (fiio->flags & FUSE_POSIX_LOCKS)
+ data->dataflags |= FSESS_POSIX_LOCKS;
+ if (fiio->flags & FUSE_EXPORT_SUPPORT)
+ data->dataflags |= FSESS_EXPORT_SUPPORT;
+ /*
+ * Don't bother to check FUSE_BIG_WRITES, because it's
+ * redundant with max_write
+ */
+ /*
+ * max_background and congestion_threshold are not
+ * implemented
+ */
} else {
err = EINVAL;
}
} else {
- /* Old fix values */
+ /* Old fixed values */
data->max_write = 4096;
}
+ if (fuse_libabi_geq(data, 7, 6))
+ data->max_readahead_blocks = fiio->max_readahead / maxbcachebuf;
+
+ if (!fuse_libabi_geq(data, 7, 7))
+ fsess_set_notimpl(data->mp, FUSE_INTERRUPT);
+
+ if (!fuse_libabi_geq(data, 7, 8)) {
+ fsess_set_notimpl(data->mp, FUSE_BMAP);
+ fsess_set_notimpl(data->mp, FUSE_DESTROY);
+ }
+
+ if (fuse_libabi_geq(data, 7, 23) && fiio->time_gran >= 1 &&
+ fiio->time_gran <= 1000000000)
+ data->time_gran = fiio->time_gran;
+ else
+ data->time_gran = 1;
+
+ if (!fuse_libabi_geq(data, 7, 23))
+ data->cache_mode = fuse_data_cache_mode;
+ else if (fiio->flags & FUSE_WRITEBACK_CACHE)
+ data->cache_mode = FUSE_CACHE_WB;
+ else
+ data->cache_mode = FUSE_CACHE_WT;
+
out:
if (err) {
fdata_set_dead(data);
@@ -669,14 +1035,156 @@
fiii = fdi.indata;
fiii->major = FUSE_KERNEL_VERSION;
fiii->minor = FUSE_KERNEL_MINOR_VERSION;
- fiii->max_readahead = FUSE_DEFAULT_IOSIZE * 16;
- fiii->flags = 0;
+ /*
+ * fusefs currently reads ahead no more than one cache block at a time.
+ * See fuse_read_biobackend
+ */
+ fiii->max_readahead = maxbcachebuf;
+ /*
+ * Unsupported features:
+ * FUSE_FILE_OPS: No known FUSE server or client supports it
+ * FUSE_ATOMIC_O_TRUNC: our VFS cannot support it
+ * FUSE_DONT_MASK: unlike Linux, FreeBSD always applies the umask, even
+ * when default ACLs are in use.
+ * FUSE_SPLICE_WRITE, FUSE_SPLICE_MOVE, FUSE_SPLICE_READ: FreeBSD
+ * doesn't have splice(2).
+ * FUSE_FLOCK_LOCKS: not yet implemented
+ * FUSE_HAS_IOCTL_DIR: not yet implemented
+ * FUSE_AUTO_INVAL_DATA: not yet implemented
+ * FUSE_DO_READDIRPLUS: not yet implemented
+ * FUSE_READDIRPLUS_AUTO: not yet implemented
+ * FUSE_ASYNC_DIO: not yet implemented
+ * FUSE_NO_OPEN_SUPPORT: not yet implemented
+ */
+ fiii->flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_EXPORT_SUPPORT
+ | FUSE_BIG_WRITES | FUSE_WRITEBACK_CACHE;
fuse_insert_callback(fdi.tick, fuse_internal_init_callback);
- fuse_insert_message(fdi.tick);
+ fuse_insert_message(fdi.tick, false);
fdisp_destroy(&fdi);
}
+/*
+ * Send a FUSE_SETATTR operation with no permissions checks. If cred is NULL,
+ * send the request with root credentials
+ */
+int fuse_internal_setattr(struct vnode *vp, struct vattr *vap,
+ struct thread *td, struct ucred *cred)
+{
+ struct fuse_vnode_data *fvdat;
+ struct fuse_dispatcher fdi;
+ struct fuse_setattr_in *fsai;
+ struct mount *mp;
+ pid_t pid = td->td_proc->p_pid;
+ struct fuse_data *data;
+ int dataflags;
+ int err = 0;
+ enum vtype vtyp;
+ int sizechanged = -1;
+ uint64_t newsize = 0;
+
+ mp = vnode_mount(vp);
+ fvdat = VTOFUD(vp);
+ data = fuse_get_mpdata(mp);
+ dataflags = data->dataflags;
+
+ fdisp_init(&fdi, sizeof(*fsai));
+ fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred);
+ if (!cred) {
+ fdi.finh->uid = 0;
+ fdi.finh->gid = 0;
+ }
+ fsai = fdi.indata;
+ fsai->valid = 0;
+
+ if (vap->va_uid != (uid_t)VNOVAL) {
+ fsai->uid = vap->va_uid;
+ fsai->valid |= FATTR_UID;
+ }
+ if (vap->va_gid != (gid_t)VNOVAL) {
+ fsai->gid = vap->va_gid;
+ fsai->valid |= FATTR_GID;
+ }
+ if (vap->va_size != VNOVAL) {
+ struct fuse_filehandle *fufh = NULL;
+
+ /* Truncate to a new value. */
+ fsai->size = vap->va_size;
+ sizechanged = 1;
+ newsize = vap->va_size;
+ fsai->valid |= FATTR_SIZE;
+
+ fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid);
+ if (fufh) {
+ fsai->fh = fufh->fh_id;
+ fsai->valid |= FATTR_FH;
+ }
+ VTOFUD(vp)->flag &= ~FN_SIZECHANGE;
+ }
+ if (vap->va_atime.tv_sec != VNOVAL) {
+ fsai->atime = vap->va_atime.tv_sec;
+ fsai->atimensec = vap->va_atime.tv_nsec;
+ fsai->valid |= FATTR_ATIME;
+ if (vap->va_vaflags & VA_UTIMES_NULL)
+ fsai->valid |= FATTR_ATIME_NOW;
+ }
+ if (vap->va_mtime.tv_sec != VNOVAL) {
+ fsai->mtime = vap->va_mtime.tv_sec;
+ fsai->mtimensec = vap->va_mtime.tv_nsec;
+ fsai->valid |= FATTR_MTIME;
+ if (vap->va_vaflags & VA_UTIMES_NULL)
+ fsai->valid |= FATTR_MTIME_NOW;
+ } else if (fvdat->flag & FN_MTIMECHANGE) {
+ fsai->mtime = fvdat->cached_attrs.va_mtime.tv_sec;
+ fsai->mtimensec = fvdat->cached_attrs.va_mtime.tv_nsec;
+ fsai->valid |= FATTR_MTIME;
+ }
+ if (fuse_libabi_geq(data, 7, 23) && fvdat->flag & FN_CTIMECHANGE) {
+ fsai->ctime = fvdat->cached_attrs.va_ctime.tv_sec;
+ fsai->ctimensec = fvdat->cached_attrs.va_ctime.tv_nsec;
+ fsai->valid |= FATTR_CTIME;
+ }
+ if (vap->va_mode != (mode_t)VNOVAL) {
+ fsai->mode = vap->va_mode & ALLPERMS;
+ fsai->valid |= FATTR_MODE;
+ }
+ if (!fsai->valid) {
+ goto out;
+ }
+
+ if ((err = fdisp_wait_answ(&fdi)))
+ goto out;
+ vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode);
+
+ if (vnode_vtype(vp) != vtyp) {
+ if (vnode_vtype(vp) == VNON && vtyp != VNON) {
+ SDT_PROBE2(fusefs, , internal, trace, 1, "FUSE: Dang! "
+ "vnode_vtype is VNON and vtype isn't.");
+ } else {
+ /*
+ * STALE vnode, ditch
+ *
+ * The vnode has changed its type "behind our back".
+ * There's nothing really we can do, so let us just
+ * force an internal revocation and tell the caller to
+ * try again, if interested.
+ */
+ fuse_internal_vnode_disappear(vp);
+ err = EAGAIN;
+ }
+ }
+ if (err == 0) {
+ struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ;
+ fuse_vnode_undirty_cached_timestamps(vp);
+ fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
+ fao->attr_valid_nsec, NULL);
+ }
+
+out:
+ fdisp_destroy(&fdi);
+ return err;
+}
+
#ifdef ZERO_PAD_INCOMPLETE_BUFS
static int
isbzero(void *buf, size_t len)
@@ -692,3 +1200,19 @@
}
#endif
+
+void
+fuse_internal_init(void)
+{
+ fuse_lookup_cache_misses = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_lookup_cache_misses);
+ fuse_lookup_cache_hits = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_lookup_cache_hits);
+}
+
+void
+fuse_internal_destroy(void)
+{
+ counter_u64_free(fuse_lookup_cache_hits);
+ counter_u64_free(fuse_lookup_cache_misses);
+}
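
The entry-cache timeout checked in fuse_internal_get_cached_vnode() and the attribute
cache armed in fuse_internal_cache_attrs() earlier in this file are both driven by the
entry_valid/attr_valid TTLs that the daemon returns with each FUSE_LOOKUP reply. A
hedged daemon-side sketch, assuming the libfuse low-level API and a hypothetical
one-file filesystem:

    #define FUSE_USE_VERSION 31
    #include <errno.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <fuse_lowlevel.h>

    /*
     * Hypothetical lookup handler.  entry_timeout and attr_timeout (in
     * seconds) become the bintime deadlines checked by the kernel; a
     * timeout of zero disables caching of that item entirely.
     */
    static void
    example_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
    {
        struct fuse_entry_param e;

        if (parent != FUSE_ROOT_ID || strcmp(name, "hello") != 0) {
            fuse_reply_err(req, ENOENT);
            return;
        }
        memset(&e, 0, sizeof(e));
        e.ino = 2;
        e.attr.st_ino = 2;
        e.attr.st_mode = S_IFREG | 0444;
        e.attr.st_nlink = 1;
        e.entry_timeout = 60.0;  /* cache the name-to-inode mapping for 60s */
        e.attr_timeout = 1.0;    /* cache the attributes for 1s */
        fuse_reply_entry(req, &e);
    }
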
Index: sys/fs/fuse/fuse_io.h
===================================================================
--- sys/fs/fuse/fuse_io.h
+++ sys/fs/fuse/fuse_io.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -61,7 +66,7 @@
#define _FUSE_IO_H_
int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
- struct ucred *cred);
+ struct ucred *cred, pid_t pid);
int fuse_io_strategy(struct vnode *vp, struct buf *bp);
int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td);
int fuse_io_invalbuf(struct vnode *vp, struct thread *td);
Index: sys/fs/fuse/fuse_io.c
===================================================================
--- sys/fs/fuse/fuse_io.c
+++ sys/fs/fuse/fuse_io.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -72,6 +77,7 @@
#include <sys/sx.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
@@ -83,6 +89,7 @@
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -98,45 +105,108 @@
#include "fuse_ipc.h"
#include "fuse_io.h"
-SDT_PROVIDER_DECLARE(fuse);
/*
+ * Set in a struct buf to indicate that the write came from the buffer cache
+ * and the originating cred and pid are no longer known.
+ */
+#define B_FUSEFS_WRITE_CACHE B_FS_FLAG1
+
+SDT_PROVIDER_DECLARE(fusefs);
+/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , io, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*");
+static void
+fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred,
+ struct thread *td);
static int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
struct ucred *cred, struct fuse_filehandle *fufh);
static int
-fuse_read_biobackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh);
+fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
+ struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid);
static int
fuse_write_directbackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);
+ struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize,
+ int ioflag, bool pages);
static int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);
+ struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid);
-SDT_PROBE_DEFINE5(fuse, , io, io_dispatch, "struct vnode*", "struct uio*",
+/*
+ * FreeBSD clears the SUID and SGID bits on any write by a non-root user.
+ */
+static void
+fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred,
+ struct thread *td)
+{
+ struct fuse_data *data;
+ struct mount *mp;
+ struct vattr va;
+ int dataflags;
+
+ mp = vnode_mount(vp);
+ data = fuse_get_mpdata(mp);
+ dataflags = data->dataflags;
+
+ if (dataflags & FSESS_DEFAULT_PERMISSIONS) {
+ if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) {
+ fuse_internal_getattr(vp, &va, cred, td);
+ if (va.va_mode & (S_ISUID | S_ISGID)) {
+ mode_t mode = va.va_mode & ~(S_ISUID | S_ISGID);
+ /* Clear all vattr fields except mode */
+ vattr_null(&va);
+ va.va_mode = mode;
+
+ /*
+ * Ignore fuse_internal_setattr's return value,
+ * because at this point the write operation has
+ * already succeeded and we don't want to return
+ * failing status for that.
+ */
+ (void)fuse_internal_setattr(vp, &va, td, NULL);
+ }
+ }
+ }
+}
+
+SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch, "struct vnode*", "struct uio*",
"int", "struct ucred*", "struct fuse_filehandle*");
+SDT_PROBE_DEFINE4(fusefs, , io, io_dispatch_filehandles_closed, "struct vnode*",
+ "struct uio*", "int", "struct ucred*");
int
fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
- struct ucred *cred)
+ struct ucred *cred, pid_t pid)
{
struct fuse_filehandle *fufh;
int err, directio;
+ int fflag;
+ bool closefufh = false;
MPASS(vp->v_type == VREG || vp->v_type == VDIR);
- err = fuse_filehandle_getrw(vp,
- (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
- if (err) {
+ fflag = (uio->uio_rw == UIO_READ) ? FREAD : FWRITE;
+ err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
+ if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
+ /*
+ * nfsd will do I/O without first doing VOP_OPEN. We
+ * must implicitly open the file here
+ */
+ err = fuse_filehandle_open(vp, fflag, &fufh, curthread, cred);
+ closefufh = true;
+ }
+ else if (err) {
+ SDT_PROBE4(fusefs, , io, io_dispatch_filehandles_closed,
+ vp, uio, ioflag, cred);
printf("FUSE: io dispatch: filehandles are closed\n");
return err;
}
- SDT_PROBE5(fuse, , io, io_dispatch, vp, uio, ioflag, cred, fufh);
+ if (err)
+ goto out;
+ SDT_PROBE5(fusefs, , io, io_dispatch, vp, uio, ioflag, cred, fufh);
/*
* Ideally, when the daemon asks for direct io at open time, the
@@ -153,108 +223,136 @@
switch (uio->uio_rw) {
case UIO_READ:
if (directio) {
- SDT_PROBE2(fuse, , io, trace, 1,
+ SDT_PROBE2(fusefs, , io, trace, 1,
"direct read of vnode");
err = fuse_read_directbackend(vp, uio, cred, fufh);
} else {
- SDT_PROBE2(fuse, , io, trace, 1,
+ SDT_PROBE2(fusefs, , io, trace, 1,
"buffered read of vnode");
- err = fuse_read_biobackend(vp, uio, cred, fufh);
+ err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh,
+ pid);
}
break;
case UIO_WRITE:
- /*
- * Kludge: simulate write-through caching via write-around
- * caching. Same effect, as far as never caching dirty data,
- * but slightly pessimal in that newly written data is not
- * cached.
- */
- if (directio || fuse_data_cache_mode == FUSE_CACHE_WT) {
- SDT_PROBE2(fuse, , io, trace, 1,
+ fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE);
+ if (directio) {
+ const int iosize = fuse_iosize(vp);
+ off_t start, end, filesize;
+
+ SDT_PROBE2(fusefs, , io, trace, 1,
"direct write of vnode");
- err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag);
+
+ err = fuse_vnode_size(vp, &filesize, cred, curthread);
+ if (err)
+ goto out;
+
+ start = uio->uio_offset;
+ end = start + uio->uio_resid;
+ KASSERT((ioflag & (IO_VMIO | IO_DIRECT)) !=
+ (IO_VMIO | IO_DIRECT),
+ ("IO_DIRECT used for a cache flush?"));
+ /* Invalidate the write cache when writing directly */
+ v_inval_buf_range(vp, start, end, iosize);
+ err = fuse_write_directbackend(vp, uio, cred, fufh,
+ filesize, ioflag, false);
} else {
- SDT_PROBE2(fuse, , io, trace, 1,
+ SDT_PROBE2(fusefs, , io, trace, 1,
"buffered write of vnode");
- err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag);
+ if (!fsess_opt_writeback(vnode_mount(vp)))
+ ioflag |= IO_SYNC;
+ err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag,
+ pid);
}
+ fuse_io_clear_suid_on_write(vp, cred, uio->uio_td);
break;
default:
panic("uninterpreted mode passed to fuse_io_dispatch");
}
+out:
+ if (closefufh)
+ fuse_filehandle_close(vp, fufh, curthread, cred);
+
return (err);
}
-SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_start, "int", "int", "int");
-SDT_PROBE_DEFINE2(fuse, , io, read_bio_backend_feed, "int", "int");
-SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_end, "int", "ssize_t", "int");
+SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int");
+SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "struct buf*");
+SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int",
+ "struct buf*");
static int
-fuse_read_biobackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh)
+fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
+ struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid)
{
struct buf *bp;
- daddr_t lbn;
- int bcount;
- int err = 0, n = 0, on = 0;
+ struct mount *mp;
+ struct fuse_data *data;
+ daddr_t lbn, nextlbn;
+ int bcount, nextsize;
+ int err, n = 0, on = 0, seqcount;
off_t filesize;
const int biosize = fuse_iosize(vp);
+ mp = vnode_mount(vp);
+ data = fuse_get_mpdata(mp);
- if (uio->uio_resid == 0)
- return (0);
if (uio->uio_offset < 0)
return (EINVAL);
- bcount = biosize;
- filesize = VTOFUD(vp)->filesize;
+ seqcount = ioflag >> IO_SEQSHIFT;
- do {
+ err = fuse_vnode_size(vp, &filesize, cred, curthread);
+ if (err)
+ return err;
+
+ for (err = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if (fuse_isdeadfs(vp)) {
err = ENXIO;
break;
}
+ if (filesize - uio->uio_offset <= 0)
+ break;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
- SDT_PROBE3(fuse, , io, read_bio_backend_start,
- biosize, (int)lbn, on);
-
- /*
- * Obtain the buffer cache block. Figure out the buffer size
- * when we are at EOF. If we are modifying the size of the
- * buffer based on an EOF condition we need to hold
- * nfs_rslock() through obtaining the buffer to prevent
- * a potential writer-appender from messing with n_size.
- * Otherwise we may accidentally truncate the buffer and
- * lose dirty data.
- *
- * Note that bcount is *not* DEV_BSIZE aligned.
- */
if ((off_t)lbn * biosize >= filesize) {
bcount = 0;
} else if ((off_t)(lbn + 1) * biosize > filesize) {
bcount = filesize - (off_t)lbn *biosize;
+ } else {
+ bcount = biosize;
}
- bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
+ nextlbn = lbn + 1;
+ nextsize = MIN(biosize, filesize - nextlbn * biosize);
- if (!bp)
- return (EINTR);
+ SDT_PROBE4(fusefs, , io, read_bio_backend_start,
+ biosize, (int)lbn, on, bcount);
- /*
- * If B_CACHE is not set, we must issue the read. If this
- * fails, we return an error.
- */
+ if (bcount < biosize) {
+ /* If near EOF, don't do readahead */
+ err = bread(vp, lbn, bcount, NOCRED, &bp);
+ } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
+ /* Try clustered read */
+ long totread = uio->uio_resid + on;
+ seqcount = MIN(seqcount,
+ data->max_readahead_blocks + 1);
+ err = cluster_read(vp, filesize, lbn, bcount, NOCRED,
+ totread, seqcount, 0, &bp);
+ } else if (seqcount > 1 && data->max_readahead_blocks >= 1) {
+ /* Try non-clustered readahead */
+ err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1,
+ NOCRED, &bp);
+ } else {
+ /* Just read what was requested */
+ err = bread(vp, lbn, bcount, NOCRED, &bp);
+ }
- if ((bp->b_flags & B_CACHE) == 0) {
- bp->b_iocmd = BIO_READ;
- vfs_busy_pages(bp, 0);
- err = fuse_io_strategy(vp, bp);
- if (err) {
- brelse(bp);
- return (err);
- }
+ if (err) {
+ brelse(bp);
+ bp = NULL;
+ break;
}
+
/*
* on is the offset into the current bp. Figure out how many
* bytes we can copy out of the bp. Note that bcount is
@@ -264,33 +362,41 @@
*/
n = 0;
- if (on < bcount)
- n = MIN((unsigned)(bcount - on), uio->uio_resid);
+ if (on < bcount - bp->b_resid)
+ n = MIN((unsigned)(bcount - bp->b_resid - on),
+ uio->uio_resid);
if (n > 0) {
- SDT_PROBE2(fuse, , io, read_bio_backend_feed,
- n, n + (int)bp->b_resid);
+ SDT_PROBE2(fusefs, , io, read_bio_backend_feed, n, bp);
err = uiomove(bp->b_data + on, n, uio);
}
- brelse(bp);
- SDT_PROBE3(fuse, , io, read_bio_backend_end, err,
- uio->uio_resid, n);
- } while (err == 0 && uio->uio_resid > 0 && n > 0);
+ vfs_bio_brelse(bp, ioflag);
+ SDT_PROBE4(fusefs, , io, read_bio_backend_end, err,
+ uio->uio_resid, n, bp);
+ if (bp->b_resid > 0) {
+ /* Short read indicates EOF */
+ break;
+ }
+ }
return (err);
}
-SDT_PROBE_DEFINE1(fuse, , io, read_directbackend_start, "struct fuse_read_in*");
-SDT_PROBE_DEFINE2(fuse, , io, read_directbackend_complete,
- "struct fuse_dispatcher*", "struct uio*");
+SDT_PROBE_DEFINE1(fusefs, , io, read_directbackend_start,
+ "struct fuse_read_in*");
+SDT_PROBE_DEFINE3(fusefs, , io, read_directbackend_complete,
+ "struct fuse_dispatcher*", "struct fuse_read_in*", "struct uio*");
static int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
struct ucred *cred, struct fuse_filehandle *fufh)
{
+ struct fuse_data *data;
struct fuse_dispatcher fdi;
struct fuse_read_in *fri;
int err = 0;
+ data = fuse_get_mpdata(vp->v_mount);
+
if (uio->uio_resid == 0)
return (0);
@@ -312,19 +418,29 @@
fri->offset = uio->uio_offset;
fri->size = MIN(uio->uio_resid,
fuse_get_mpdata(vp->v_mount)->max_read);
+ if (fuse_libabi_geq(data, 7, 9)) {
+ /* See comment regarding FUSE_WRITE_LOCKOWNER */
+ fri->read_flags = 0;
+ fri->flags = fufh_type_2_fflags(fufh->fufh_type);
+ }
- SDT_PROBE1(fuse, , io, read_directbackend_start, fri);
+ SDT_PROBE1(fusefs, , io, read_directbackend_start, fri);
if ((err = fdisp_wait_answ(&fdi)))
goto out;
- SDT_PROBE2(fuse, , io, read_directbackend_complete,
- fdi.iosize, uio);
+ SDT_PROBE3(fusefs, , io, read_directbackend_complete,
+ &fdi, fri, uio);
if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio)))
break;
- if (fdi.iosize < fri->size)
+ if (fdi.iosize < fri->size) {
+ /*
+ * Short read. Should only happen at EOF or with
+ * direct io.
+ */
break;
+ }
}
out:
@@ -334,25 +450,57 @@
static int
fuse_write_directbackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
+ struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize,
+ int ioflag, bool pages)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_data *data;
struct fuse_write_in *fwi;
+ struct fuse_write_out *fwo;
struct fuse_dispatcher fdi;
size_t chunksize;
+ void *fwi_data;
+ off_t as_written_offset;
int diff;
int err = 0;
+ bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO;
+ bool wrote_anything = false;
+ uint32_t write_flags;
+ data = fuse_get_mpdata(vp->v_mount);
+
+ /*
+ * Don't set FUSE_WRITE_LOCKOWNER in write_flags. It can't be set
+ * accurately when using POSIX AIO, libfuse doesn't use it, and I'm not
+ * aware of any file systems that do. It was an attempt to add
+ * Linux-style mandatory locking to the FUSE protocol, but mandatory
+ * locking is deprecated even on Linux. See Linux commit
+ * f33321141b273d60cbb3a8f56a5489baad82ba5e .
+ */
+ /*
+ * Set FUSE_WRITE_CACHE whenever we don't know the uid, gid, and/or pid
+ * that originated a write. For example when writing from the
+ * writeback cache. I don't know of a single file system that cares,
+ * but the protocol says we're supposed to do this.
+ */
+ write_flags = !pages && (
+ (ioflag & IO_DIRECT) ||
+ !fsess_opt_datacache(vnode_mount(vp)) ||
+ !fsess_opt_writeback(vnode_mount(vp))) ? 0 : FUSE_WRITE_CACHE;
+
if (uio->uio_resid == 0)
return (0);
+
if (ioflag & IO_APPEND)
- uio_setoffset(uio, fvdat->filesize);
+ uio_setoffset(uio, filesize);
+ if (vn_rlimit_fsize(vp, uio, uio->uio_td))
+ return (EFBIG);
+
fdisp_init(&fdi, 0);
while (uio->uio_resid > 0) {
- chunksize = MIN(uio->uio_resid,
- fuse_get_mpdata(vp->v_mount)->max_write);
+ chunksize = MIN(uio->uio_resid, data->max_write);
fdi.iosize = sizeof(*fwi) + chunksize;
fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred);
@@ -361,79 +509,140 @@
fwi->fh = fufh->fh_id;
fwi->offset = uio->uio_offset;
fwi->size = chunksize;
+ fwi->write_flags = write_flags;
+ if (fuse_libabi_geq(data, 7, 9)) {
+ fwi->flags = fufh_type_2_fflags(fufh->fufh_type);
+ fwi_data = (char *)fdi.indata + sizeof(*fwi);
+ } else {
+ fwi_data = (char *)fdi.indata +
+ FUSE_COMPAT_WRITE_IN_SIZE;
+ }
- if ((err = uiomove((char *)fdi.indata + sizeof(*fwi),
- chunksize, uio)))
+ if ((err = uiomove(fwi_data, chunksize, uio)))
break;
- if ((err = fdisp_wait_answ(&fdi)))
+retry:
+ err = fdisp_wait_answ(&fdi);
+ if (err == ERESTART || err == EINTR || err == EWOULDBLOCK) {
+ /*
+ * Rewind the uio so dofilewrite will know it's
+ * incomplete
+ */
+ uio->uio_resid += fwi->size;
+ uio->uio_offset -= fwi->size;
+ /*
+ * Change ERESTART into EINTR because we can't rewind
+ * uio->uio_iov. Basically, once uiomove(9) has been
+ * called, it's impossible to restart a syscall.
+ */
+ if (err == ERESTART)
+ err = EINTR;
break;
+ } else if (err) {
+ break;
+ } else {
+ wrote_anything = true;
+ }
+ fwo = ((struct fuse_write_out *)fdi.answ);
+
/* Adjust the uio in the case of short writes */
- diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size;
+ diff = fwi->size - fwo->size;
+ as_written_offset = uio->uio_offset - diff;
+
+ if (as_written_offset - diff > filesize)
+ fuse_vnode_setsize(vp, as_written_offset);
+ if (as_written_offset - diff >= filesize)
+ fvdat->flag &= ~FN_SIZECHANGE;
+
if (diff < 0) {
+ printf("WARNING: misbehaving FUSE filesystem "
+ "wrote more data than we provided it\n");
err = EINVAL;
break;
- } else if (diff > 0 && !(ioflag & IO_DIRECT)) {
- /*
- * XXX We really should be directly checking whether
- * the file was opened with FOPEN_DIRECT_IO, not
- * IO_DIRECT. IO_DIRECT can be set in multiple ways.
- */
- SDT_PROBE2(fuse, , io, trace, 1,
- "misbehaving filesystem: short writes are only "
- "allowed with direct_io");
+ } else if (diff > 0) {
+ /* Short write */
+ if (!direct_io) {
+ printf("WARNING: misbehaving FUSE filesystem: "
+ "short writes are only allowed with "
+ "direct_io\n");
+ }
+ if (ioflag & IO_DIRECT) {
+ /* Return early */
+ uio->uio_resid += diff;
+ uio->uio_offset -= diff;
+ break;
+ } else {
+ /* Resend the unwritten portion of data */
+ fdi.iosize = sizeof(*fwi) + diff;
+ /* Refresh fdi without clearing data buffer */
+ fdisp_refresh_vp(&fdi, FUSE_WRITE, vp,
+ uio->uio_td, cred);
+ fwi = fdi.indata;
+ MPASS2(fwi == fdi.indata, "FUSE dispatcher "
+ "reallocated despite no increase in "
+ "size?");
+ void *src = (char*)fwi_data + fwo->size;
+ memmove(fwi_data, src, diff);
+ fwi->fh = fufh->fh_id;
+ fwi->offset = as_written_offset;
+ fwi->size = diff;
+ fwi->write_flags = write_flags;
+ goto retry;
+ }
}
- uio->uio_resid += diff;
- uio->uio_offset -= diff;
-
- if (uio->uio_offset > fvdat->filesize &&
- fuse_data_cache_mode != FUSE_CACHE_UC) {
- fuse_vnode_setsize(vp, uio->uio_offset);
- fvdat->flag &= ~FN_SIZECHANGE;
- }
}
fdisp_destroy(&fdi);
+ if (wrote_anything)
+ fuse_vnode_undirty_cached_timestamps(vp);
+
return (err);
}
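
The retry loop above handles a daemon that acknowledges fewer bytes than it was sent.
On the daemon side (libfuse low-level API) the acknowledged count is simply whatever is
passed to fuse_reply_write(); a minimal, hypothetical handler:

    #define FUSE_USE_VERSION 31
    #include <fuse_lowlevel.h>

    /* Hypothetical write handler backed by some external store. */
    static void
    example_write(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size,
        off_t off, struct fuse_file_info *fi)
    {
        size_t accepted = size;    /* pretend everything was stored */

        (void)ino; (void)buf; (void)off; (void)fi;
        /*
         * Replying with accepted < size is the short-write case above: it
         * is only expected from handles opened with FOPEN_DIRECT_IO.  An
         * IO_DIRECT caller gets the short count back; a cached write has
         * its unwritten tail resent by the kernel.
         */
        fuse_reply_write(req, accepted);
    }
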
-SDT_PROBE_DEFINE6(fuse, , io, write_biobackend_start, "int64_t", "int", "int",
+SDT_PROBE_DEFINE6(fusefs, , io, write_biobackend_start, "int64_t", "int", "int",
"struct uio*", "int", "bool");
-SDT_PROBE_DEFINE2(fuse, , io, write_biobackend_append_race, "long", "int");
+SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_append_race, "long", "int");
+SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_issue, "int", "struct buf*");
static int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
+ struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct buf *bp;
daddr_t lbn;
+ off_t filesize;
int bcount;
- int n, on, err = 0;
+ int n, on, seqcount, err = 0;
+ bool last_page;
const int biosize = fuse_iosize(vp);
- KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
+ seqcount = ioflag >> IO_SEQSHIFT;
+
+ KASSERT(uio->uio_rw == UIO_WRITE, ("fuse_write_biobackend mode"));
if (vp->v_type != VREG)
return (EIO);
if (uio->uio_offset < 0)
return (EINVAL);
if (uio->uio_resid == 0)
return (0);
+
+ err = fuse_vnode_size(vp, &filesize, cred, curthread);
+ if (err)
+ return err;
+
if (ioflag & IO_APPEND)
- uio_setoffset(uio, fvdat->filesize);
+ uio_setoffset(uio, filesize);
- /*
- * Find all of this file's B_NEEDCOMMIT buffers. If our writes
- * would exceed the local maximum per-file write commit size when
- * combined with those, we must decide whether to flush,
- * go synchronous, or return err. We don't bother checking
- * IO_UNIT -- we just make all writes atomic anyway, as there's
- * no point optimizing for something that really won't ever happen.
- */
+ if (vn_rlimit_fsize(vp, uio, uio->uio_td))
+ return (EFBIG);
+
do {
+ bool direct_append, extending;
+
if (fuse_isdeadfs(vp)) {
err = ENXIO;
break;
@@ -443,66 +652,60 @@
n = MIN((unsigned)(biosize - on), uio->uio_resid);
again:
- /*
- * Handle direct append and file extension cases, calculate
- * unaligned buffer size.
- */
- if (uio->uio_offset == fvdat->filesize && n) {
- /*
- * Get the buffer (in its pre-append state to maintain
- * B_CACHE if it was previously set). Resize the
- * nfsnode after we have locked the buffer to prevent
- * readers from reading garbage.
- */
- bcount = on;
- SDT_PROBE6(fuse, , io, write_biobackend_start,
- lbn, on, n, uio, bcount, true);
- bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
-
+ /* Get or create a buffer for the write */
+ direct_append = uio->uio_offset == filesize && n;
+ if (uio->uio_offset + n < filesize) {
+ extending = false;
+ if ((off_t)(lbn + 1) * biosize < filesize) {
+ /* Not the file's last block */
+ bcount = biosize;
+ } else {
+ /* The file's last block */
+ bcount = filesize - (off_t)lbn * biosize;
+ }
+ } else {
+ extending = true;
+ bcount = on + n;
+ }
+ if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >=
+ howmany(filesize, PAGE_SIZE))
+ last_page = true;
+ else
+ last_page = false;
+ if (direct_append) {
+ /*
+ * Take care to preserve the buffer's B_CACHE state so
+ * as not to cause an unnecessary read.
+ */
+ bp = getblk(vp, lbn, on, PCATCH, 0, 0);
if (bp != NULL) {
- long save;
-
- err = fuse_vnode_setsize(vp,
- uio->uio_offset + n);
- if (err) {
- brelse(bp);
- break;
- }
- save = bp->b_flags & B_CACHE;
- bcount += n;
+ uint32_t save = bp->b_flags & B_CACHE;
allocbuf(bp, bcount);
bp->b_flags |= save;
}
} else {
- /*
- * Obtain the locked cache block first, and then
- * adjust the file's size as appropriate.
- */
- bcount = on + n;
- if ((off_t)lbn * biosize + bcount < fvdat->filesize) {
- if ((off_t)(lbn + 1) * biosize < fvdat->filesize)
- bcount = biosize;
- else
- bcount = fvdat->filesize -
- (off_t)lbn *biosize;
- }
- SDT_PROBE6(fuse, , io, write_biobackend_start,
- lbn, on, n, uio, bcount, false);
bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
- if (bp && uio->uio_offset + n > fvdat->filesize) {
- err = fuse_vnode_setsize(vp,
- uio->uio_offset + n);
- if (err) {
- brelse(bp);
- break;
- }
- }
}
-
if (!bp) {
err = EINTR;
break;
}
+ if (extending) {
+ /*
+ * Extend file _after_ locking buffer so we won't race
+ * with other readers
+ */
+ err = fuse_vnode_setsize(vp, uio->uio_offset + n);
+ filesize = uio->uio_offset + n;
+ fvdat->flag |= FN_SIZECHANGE;
+ if (err) {
+ brelse(bp);
+ break;
+ }
+ }
+
+ SDT_PROBE6(fusefs, , io, write_biobackend_start,
+ lbn, on, n, uio, bcount, direct_append);
/*
* Issue a READ if B_CACHE is not set. In special-append
* mode, B_CACHE is based on the buffer prior to the write
@@ -535,6 +738,21 @@
brelse(bp);
break;
}
+ if (bp->b_resid > 0) {
+ /*
+ * Short read indicates EOF. Update file size
+ * from the server and try again.
+ */
+ SDT_PROBE2(fusefs, , io, trace, 1,
+ "Short read during a RMW");
+ brelse(bp);
+ err = fuse_vnode_size(vp, &filesize, cred,
+ curthread);
+ if (err)
+ break;
+ else
+ goto again;
+ }
}
if (bp->b_wcred == NOCRED)
bp->b_wcred = crhold(cred);
@@ -547,9 +765,8 @@
* If the chopping creates a reverse-indexed or degenerate
* situation with dirtyoff/end, we 0 both of them.
*/
-
if (bp->b_dirtyend > bcount) {
- SDT_PROBE2(fuse, , io, write_biobackend_append_race,
+ SDT_PROBE2(fusefs, , io, write_biobackend_append_race,
(long)bp->b_blkno * biosize,
bp->b_dirtyend - bcount);
bp->b_dirtyend = bcount;
@@ -582,6 +799,7 @@
* reasons: the only way to know if a write is valid
* if its actually written out.)
*/
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 0, bp);
bwrite(bp);
if (bp->b_error == EINTR) {
err = EINTR;
@@ -591,19 +809,12 @@
}
err = uiomove((char *)bp->b_data + on, n, uio);
- /*
- * Since this block is being modified, it must be written
- * again and not just committed. Since write clustering does
- * not work for the stage 1 data write, only the stage 2
- * commit rpc, we have to clear B_CLUSTEROK as well.
- */
- bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
-
if (err) {
bp->b_ioflags |= BIO_ERROR;
bp->b_error = err;
brelse(bp);
break;
+ /* TODO: vfs_bio_clrbuf like ffs_write does? */
}
/*
* Only update dirtyoff/dirtyend if not a degenerate
@@ -619,42 +830,85 @@
}
vfs_bio_set_valid(bp, on, n);
}
- err = bwrite(bp);
+
+ vfs_bio_set_flags(bp, ioflag);
+
+ bp->b_flags |= B_FUSEFS_WRITE_CACHE;
+ if (ioflag & IO_SYNC) {
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 2, bp);
+ if (!(ioflag & IO_VMIO))
+ bp->b_flags &= ~B_FUSEFS_WRITE_CACHE;
+ err = bwrite(bp);
+ } else if (vm_page_count_severe() ||
+ buf_dirty_count_severe() ||
+ (ioflag & IO_ASYNC)) {
+ bp->b_flags |= B_CLUSTEROK;
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 3, bp);
+ bawrite(bp);
+ } else if (on == 0 && n == bcount) {
+ if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
+ bp->b_flags |= B_CLUSTEROK;
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue,
+ 4, bp);
+ cluster_write(vp, bp, filesize, seqcount, 0);
+ } else {
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue,
+ 5, bp);
+ bawrite(bp);
+ }
+ } else if (ioflag & IO_DIRECT) {
+ bp->b_flags |= B_CLUSTEROK;
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 6, bp);
+ bawrite(bp);
+ } else {
+ bp->b_flags &= ~B_CLUSTEROK;
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 7, bp);
+ bdwrite(bp);
+ }
if (err)
break;
} while (uio->uio_resid > 0 && n > 0);
- if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0)
- fuse_vnode_savesize(vp, cred);
-
return (err);
}
int
fuse_io_strategy(struct vnode *vp, struct buf *bp)
{
- struct fuse_filehandle *fufh;
struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_filehandle *fufh;
struct ucred *cred;
struct uio *uiop;
struct uio uio;
struct iovec io;
+ off_t filesize;
int error = 0;
+ int fflag;
+ /* We don't know the true pid when we're dealing with the cache */
+ pid_t pid = 0;
const int biosize = fuse_iosize(vp);
MPASS(vp->v_type == VREG || vp->v_type == VDIR);
MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE);
- error = fuse_filehandle_getrw(vp,
- (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
+ fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE;
+ cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred;
+ error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
+ if (bp->b_iocmd == BIO_READ && error == EBADF) {
+ /*
+ * This may be a read-modify-write operation on a cached file
+ * opened O_WRONLY. The FUSE protocol allows this.
+ */
+ error = fuse_filehandle_get(vp, FWRITE, &fufh, cred, pid);
+ }
if (error) {
printf("FUSE: strategy: filehandles are closed\n");
bp->b_ioflags |= BIO_ERROR;
bp->b_error = error;
+ bufdone(bp);
return (error);
}
- cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred;
uiop = &uio;
uiop->uio_iov = &io;
@@ -673,40 +927,57 @@
KASSERT(!(bp->b_flags & B_DONE),
("fuse_io_strategy: bp %p already marked done", bp));
if (bp->b_iocmd == BIO_READ) {
+ ssize_t left;
+
io.iov_len = uiop->uio_resid = bp->b_bcount;
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
- uiop->uio_offset = ((off_t)bp->b_blkno) * biosize;
+ uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize;
error = fuse_read_directbackend(vp, uiop, cred, fufh);
+ /*
+ * Store the amount we failed to read in the buffer's private
+ * field, so callers can truncate the file if necessary.
+ */
- /* XXXCEM: Potentially invalid access to cached_attrs here */
- if ((!error && uiop->uio_resid) ||
- (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO &&
- uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 &&
- uiop->uio_offset >= fvdat->cached_attrs.va_size)) {
- /*
- * If we had a short read with no error, we must have
- * hit a file hole. We should zero-fill the remainder.
- * This can also occur if the server hits the file EOF.
- *
- * Holes used to be able to occur due to pending
- * writes, but that is not possible any longer.
- */
+ if (!error && uiop->uio_resid) {
int nread = bp->b_bcount - uiop->uio_resid;
- int left = uiop->uio_resid;
+ left = uiop->uio_resid;
+ bzero((char *)bp->b_data + nread, left);
- if (error != 0) {
- printf("FUSE: Fix broken io: offset %ju, "
- " resid %zd, file size %ju/%ju\n",
- (uintmax_t)uiop->uio_offset,
- uiop->uio_resid, fvdat->filesize,
- fvdat->cached_attrs.va_size);
- error = 0;
+ if ((fvdat->flag & FN_SIZECHANGE) == 0) {
+ /*
+ * A short read with no error, when not using
+ * direct io, and when no writes are cached,
+ * indicates EOF caused by a server-side
+ * truncation. Clear the attr cache so we'll
+ * pick up the new file size and timestamps.
+ *
+ * We must still bzero the remaining buffer so
+ * uninitialized data doesn't get exposed by a
+ * future truncate that extends the file.
+ *
+ * To prevent lock order problems, we must
+ * truncate the file upstack, not here.
+ */
+ SDT_PROBE2(fusefs, , io, trace, 1,
+ "Short read of a clean file");
+ fuse_vnode_clear_attr_cache(vp);
+ } else {
+ /*
+ * If dirty writes _are_ cached beyond EOF,
+ * that indicates a newly created hole that the
+ * server doesn't know about. Those don't pose
+ * any problem.
+ * XXX: we don't currently track whether dirty
+ * writes are cached beyond EOF, before EOF, or
+ * both.
+ */
+ SDT_PROBE2(fusefs, , io, trace, 1,
+ "Short read of a dirty file");
+ uiop->uio_resid = 0;
}
- if (left > 0)
- bzero((char *)bp->b_data + nread, left);
- uiop->uio_resid = 0;
+
}
if (error) {
bp->b_ioflags |= BIO_ERROR;
@@ -714,33 +985,33 @@
}
} else {
/*
- * If we only need to commit, try to commit
- */
- if (bp->b_flags & B_NEEDCOMMIT) {
- SDT_PROBE2(fuse, , io, trace, 1,
- "write: B_NEEDCOMMIT flags set");
- }
- /*
* Setup for actual write
*/
- if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend >
- fvdat->filesize)
- bp->b_dirtyend = fvdat->filesize -
- (off_t)bp->b_blkno * biosize;
+ error = fuse_vnode_size(vp, &filesize, cred, curthread);
+ if (error) {
+ bp->b_ioflags |= BIO_ERROR;
+ bp->b_error = error;
+ bufdone(bp);
+ return (error);
+ }
+ if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize)
+ bp->b_dirtyend = filesize -
+ (off_t)bp->b_lblkno * biosize;
+
if (bp->b_dirtyend > bp->b_dirtyoff) {
io.iov_len = uiop->uio_resid = bp->b_dirtyend
- bp->b_dirtyoff;
- uiop->uio_offset = (off_t)bp->b_blkno * biosize
+ uiop->uio_offset = (off_t)bp->b_lblkno * biosize
+ bp->b_dirtyoff;
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
uiop->uio_rw = UIO_WRITE;
- error = fuse_write_directbackend(vp, uiop, cred, fufh, 0);
+ bool pages = bp->b_flags & B_FUSEFS_WRITE_CACHE;
+ error = fuse_write_directbackend(vp, uiop, cred, fufh,
+ filesize, 0, pages);
- if (error == EINTR || error == ETIMEDOUT
- || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
-
+ if (error == EINTR || error == ETIMEDOUT) {
bp->b_flags &= ~(B_INVAL | B_NOCACHE);
if ((bp->b_flags & B_PAGING) == 0) {
bdirty(bp);
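
fuse_io_dispatch() above forces IO_SYNC on buffered writes unless fsess_opt_writeback()
is true, which requires the daemon to negotiate FUSE_WRITEBACK_CACHE during FUSE_INIT
(protocol 7.23 or later; older daemons fall back to the fuse_data_cache_mode default).
A hedged sketch of opting in, assuming the libfuse 3 low-level API:

    #define FUSE_USE_VERSION 31
    #include <fuse_lowlevel.h>

    /*
     * Hypothetical init handler.  Requesting FUSE_CAP_WRITEBACK_CACHE makes
     * libfuse set FUSE_WRITEBACK_CACHE in its FUSE_INIT reply, which
     * fuse_internal_init_callback() maps to FUSE_CACHE_WB, so buffered
     * writes are no longer forced to be write-through.
     */
    static void
    example_init(void *userdata, struct fuse_conn_info *conn)
    {
        (void)userdata;
        if (conn->capable & FUSE_CAP_WRITEBACK_CACHE)
            conn->want |= FUSE_CAP_WRITEBACK_CACHE;
    }
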
Index: sys/fs/fuse/fuse_ipc.h
===================================================================
--- sys/fs/fuse/fuse_ipc.h
+++ sys/fs/fuse/fuse_ipc.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -63,6 +68,12 @@
#include <sys/param.h>
#include <sys/refcount.h>
+enum fuse_data_cache_mode {
+ FUSE_CACHE_UC,
+ FUSE_CACHE_WT,
+ FUSE_CACHE_WB,
+};
+
struct fuse_iov {
void *base;
size_t len;
@@ -103,6 +114,12 @@
struct fuse_data *tk_data;
int tk_flag;
u_int tk_refcount;
+ /*
+ * If this ticket's operation has been interrupted, this will hold the
+ * unique value of the FUSE_INTERRUPT operation. Otherwise, it will be
+ * 0.
+ */
+ uint64_t irq_unique;
/* fields for initiating an upgoing message */
struct fuse_iov tk_ms_fiov;
@@ -147,16 +164,20 @@
ftick->tk_flag |= FT_ANSW;
}
+static inline struct fuse_in_header*
+fticket_in_header(struct fuse_ticket *ftick)
+{
+ return (struct fuse_in_header *)(ftick->tk_ms_fiov.base);
+}
+
static inline enum fuse_opcode
fticket_opcode(struct fuse_ticket *ftick)
{
- return (((struct fuse_in_header *)(ftick->tk_ms_fiov.base))->opcode);
+ return fticket_in_header(ftick)->opcode;
}
int fticket_pull(struct fuse_ticket *ftick, struct uio *uio);
-enum mountpri { FM_NOMOUNTED, FM_PRIMARY, FM_SECONDARY };
-
/*
* The data representing a FUSE session.
*/
@@ -170,10 +191,16 @@
struct mtx ms_mtx;
STAILQ_HEAD(, fuse_ticket) ms_head;
+ int ms_count;
struct mtx aw_mtx;
TAILQ_HEAD(, fuse_ticket) aw_head;
+ /*
+ * Holds the next value of the FUSE operation unique value.
+ * Also, serves as a wakeup channel to prevent any operations from
+ * being created before INIT completes.
+ */
u_long ticketer;
struct sx rename_lock;
@@ -181,6 +208,7 @@
uint32_t fuse_libabi_major;
uint32_t fuse_libabi_minor;
+ uint32_t max_readahead_blocks;
uint32_t max_write;
uint32_t max_read;
uint32_t subtype;
@@ -189,34 +217,27 @@
struct selinfo ks_rsel;
int daemon_timeout;
+ unsigned time_gran;
uint64_t notimpl;
+ uint64_t mnt_flag;
+ enum fuse_data_cache_mode cache_mode;
};
#define FSESS_DEAD 0x0001 /* session is to be closed */
-#define FSESS_UNUSED0 0x0002 /* unused */
#define FSESS_INITED 0x0004 /* session has been inited */
#define FSESS_DAEMON_CAN_SPY 0x0010 /* let non-owners access this fs */
/* (and being observed by the daemon) */
#define FSESS_PUSH_SYMLINKS_IN 0x0020 /* prefix absolute symlinks with mp */
#define FSESS_DEFAULT_PERMISSIONS 0x0040 /* kernel does permission checking */
-#define FSESS_NO_ATTRCACHE 0x0080 /* no attribute caching */
-#define FSESS_NO_READAHEAD 0x0100 /* no readaheads */
-#define FSESS_NO_DATACACHE 0x0200 /* disable buffer cache */
-#define FSESS_NO_NAMECACHE 0x0400 /* disable name cache */
-#define FSESS_NO_MMAP 0x0800 /* disable mmap */
-#define FSESS_BROKENIO 0x1000 /* fix broken io */
+#define FSESS_ASYNC_READ 0x1000 /* allow multiple reads of some file */
+#define FSESS_POSIX_LOCKS 0x2000 /* daemon supports POSIX locks */
+#define FSESS_EXPORT_SUPPORT 0x10000 /* daemon supports NFS-style lookups */
+#define FSESS_INTR 0x20000 /* interruptible mounts */
+#define FSESS_MNTOPTS_MASK ( \
+ FSESS_DAEMON_CAN_SPY | FSESS_PUSH_SYMLINKS_IN | \
+ FSESS_DEFAULT_PERMISSIONS | FSESS_INTR)
-enum fuse_data_cache_mode {
- FUSE_CACHE_UC,
- FUSE_CACHE_WT,
- FUSE_CACHE_WB,
-};
-
extern int fuse_data_cache_mode;
-extern int fuse_data_cache_invalidate;
-extern int fuse_mmap_enable;
-extern int fuse_sync_resize;
-extern int fuse_fix_broken_io;
static inline struct fuse_data *
fuse_get_mpdata(struct mount *mp)
@@ -245,36 +266,43 @@
{
struct fuse_data *data = fuse_get_mpdata(mp);
- return (fuse_data_cache_mode != FUSE_CACHE_UC &&
- (data->dataflags & FSESS_NO_DATACACHE) == 0);
+ return (data->cache_mode != FUSE_CACHE_UC);
}
static inline bool
fsess_opt_mmap(struct mount *mp)
{
- struct fuse_data *data = fuse_get_mpdata(mp);
-
- if (!fuse_mmap_enable || fuse_data_cache_mode == FUSE_CACHE_UC)
- return (false);
- return ((data->dataflags & (FSESS_NO_DATACACHE | FSESS_NO_MMAP)) == 0);
+ return (fsess_opt_datacache(mp));
}
static inline bool
-fsess_opt_brokenio(struct mount *mp)
+fsess_opt_writeback(struct mount *mp)
{
struct fuse_data *data = fuse_get_mpdata(mp);
- return (fuse_fix_broken_io || (data->dataflags & FSESS_BROKENIO));
+ return (data->cache_mode == FUSE_CACHE_WB);
}
+/* Insert a new upgoing message */
static inline void
fuse_ms_push(struct fuse_ticket *ftick)
{
mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED);
refcount_acquire(&ftick->tk_refcount);
STAILQ_INSERT_TAIL(&ftick->tk_data->ms_head, ftick, tk_ms_link);
+ ftick->tk_data->ms_count++;
}
+/* Insert a new upgoing message to the front of the queue */
+static inline void
+fuse_ms_push_head(struct fuse_ticket *ftick)
+{
+ mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED);
+ refcount_acquire(&ftick->tk_refcount);
+ STAILQ_INSERT_HEAD(&ftick->tk_data->ms_head, ftick, tk_ms_link);
+ ftick->tk_data->ms_count++;
+}
+
static inline struct fuse_ticket *
fuse_ms_pop(struct fuse_data *data)
{
@@ -284,7 +312,9 @@
if ((ftick = STAILQ_FIRST(&data->ms_head))) {
STAILQ_REMOVE_HEAD(&data->ms_head, tk_ms_link);
+ data->ms_count--;
#ifdef INVARIANTS
+ MPASS(data->ms_count >= 0);
ftick->tk_ms_link.stqe_next = NULL;
#endif
}
@@ -327,7 +357,7 @@
struct fuse_ticket *fuse_ticket_fetch(struct fuse_data *data);
int fuse_ticket_drop(struct fuse_ticket *ftick);
void fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t *handler);
-void fuse_insert_message(struct fuse_ticket *ftick);
+void fuse_insert_message(struct fuse_ticket *ftick, bool irq);
static inline bool
fuse_libabi_geq(struct fuse_data *data, uint32_t abi_maj, uint32_t abi_min)
@@ -374,13 +404,15 @@
#endif
}
+void fdisp_refresh(struct fuse_dispatcher *fdip);
+
void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred);
-void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
- struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred);
-
void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+ struct vnode *vp, struct thread *td, struct ucred *cred);
+
+void fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct vnode *vp, struct thread *td, struct ucred *cred);
int fdisp_wait_answ(struct fuse_dispatcher *fdip);
Index: sys/fs/fuse/fuse_ipc.c
===================================================================
--- sys/fs/fuse/fuse_ipc.c
+++ sys/fs/fuse/fuse_ipc.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -61,6 +66,7 @@
#include <sys/param.h>
#include <sys/module.h>
#include <sys/systm.h>
+#include <sys/counter.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/conf.h>
@@ -84,14 +90,17 @@
#include "fuse_ipc.h"
#include "fuse_internal.h"
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , ipc, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , ipc, trace, "int", "char*");
+static void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+ struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred);
+static void fuse_interrupt_send(struct fuse_ticket *otick, int err);
static struct fuse_ticket *fticket_alloc(struct fuse_data *data);
static void fticket_refresh(struct fuse_ticket *ftick);
static void fticket_destroy(struct fuse_ticket *ftick);
@@ -104,13 +113,10 @@
static fuse_handler_t fuse_standard_handler;
-SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables");
-SYSCTL_STRING(_vfs_fusefs, OID_AUTO, version, CTLFLAG_RD,
- FUSE_FREEBSD_VERSION, 0, "fuse-freebsd version");
-static int fuse_ticket_count = 0;
+static counter_u64_t fuse_ticket_count;
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, ticket_count, CTLFLAG_RD,
+ &fuse_ticket_count, "Number of allocated tickets");
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, ticket_count, CTLFLAG_RW,
- &fuse_ticket_count, 0, "number of allocated tickets");
static long fuse_iov_permanent_bufsize = 1 << 19;
SYSCTL_LONG(_vfs_fusefs, OID_AUTO, iov_permanent_bufsize, CTLFLAG_RW,
@@ -125,25 +131,131 @@
MALLOC_DEFINE(M_FUSEMSG, "fuse_msgbuf", "fuse message buffer");
static uma_zone_t ticket_zone;
-static void
-fuse_block_sigs(sigset_t *oldset)
+/*
+ * TODO: figure out how to time out INTERRUPT requests, because the daemon may
+ * legally never respond
+ */
+static int
+fuse_interrupt_callback(struct fuse_ticket *tick, struct uio *uio)
{
- sigset_t newset;
+ struct fuse_ticket *otick, *x_tick;
+ struct fuse_interrupt_in *fii;
+ struct fuse_data *data = tick->tk_data;
+ bool found = false;
- SIGFILLSET(newset);
- SIGDELSET(newset, SIGKILL);
- if (kern_sigprocmask(curthread, SIG_BLOCK, &newset, oldset, 0))
- panic("%s: Invalid operation for kern_sigprocmask()",
- __func__);
+ fii = (struct fuse_interrupt_in*)((char*)tick->tk_ms_fiov.base +
+ sizeof(struct fuse_in_header));
+
+ fuse_lck_mtx_lock(data->aw_mtx);
+ TAILQ_FOREACH_SAFE(otick, &data->aw_head, tk_aw_link, x_tick) {
+ if (otick->tk_unique == fii->unique) {
+ found = true;
+ break;
+ }
+ }
+ fuse_lck_mtx_unlock(data->aw_mtx);
+
+ if (!found) {
+ /* Original is already complete. Just return */
+ return 0;
+ }
+
+ /* Clear the original ticket's interrupt association */
+ otick->irq_unique = 0;
+
+ if (tick->tk_aw_ohead.error == ENOSYS) {
+ fsess_set_notimpl(data->mp, FUSE_INTERRUPT);
+ return 0;
+ } else if (tick->tk_aw_ohead.error == EAGAIN) {
+ /*
+ * There are two reasons we might get this:
+ * 1) the daemon received the INTERRUPT request before the
+ * original, or
+ * 2) the daemon received the INTERRUPT request after it
+ * completed the original request.
+ * In the first case we should re-send the INTERRUPT. In the
+ * second, we should ignore it.
+ */
+ /* Resend */
+ fuse_interrupt_send(otick, EINTR);
+ return 0;
+ } else {
+ /* Illegal FUSE_INTERRUPT response */
+ return EINVAL;
+ }
}
-static void
-fuse_restore_sigs(sigset_t *oldset)
+/* Interrupt the operation otick. Return err as its error code */
+void
+fuse_interrupt_send(struct fuse_ticket *otick, int err)
{
+ struct fuse_dispatcher fdi;
+ struct fuse_interrupt_in *fii;
+ struct fuse_in_header *ftick_hdr;
+ struct fuse_data *data = otick->tk_data;
+ struct fuse_ticket *tick, *xtick;
+ struct ucred reused_creds;
+ gid_t reused_groups[1];
- if (kern_sigprocmask(curthread, SIG_SETMASK, oldset, NULL, 0))
- panic("%s: Invalid operation for kern_sigprocmask()",
- __func__);
+ if (otick->irq_unique == 0) {
+ /*
+ * If the daemon hasn't yet received otick, then we can answer
+ * it ourselves and return.
+ */
+ fuse_lck_mtx_lock(data->ms_mtx);
+ STAILQ_FOREACH_SAFE(tick, &otick->tk_data->ms_head, tk_ms_link,
+ xtick) {
+ if (tick == otick) {
+ STAILQ_REMOVE(&otick->tk_data->ms_head, tick,
+ fuse_ticket, tk_ms_link);
+ otick->tk_data->ms_count--;
+ otick->tk_ms_link.stqe_next = NULL;
+ fuse_lck_mtx_unlock(data->ms_mtx);
+
+ fuse_lck_mtx_lock(otick->tk_aw_mtx);
+ if (!fticket_answered(otick)) {
+ fticket_set_answered(otick);
+ otick->tk_aw_errno = err;
+ wakeup(otick);
+ }
+ fuse_lck_mtx_unlock(otick->tk_aw_mtx);
+
+ fuse_ticket_drop(tick);
+ return;
+ }
+ }
+ fuse_lck_mtx_unlock(data->ms_mtx);
+
+ /*
+ * If the fuse daemon doesn't support interrupts, then there's
+ * nothing more that we can do
+ */
+ if (!fsess_isimpl(data->mp, FUSE_INTERRUPT))
+ return;
+
+ /*
+ * If the fuse daemon has already received otick, then we must
+ * send FUSE_INTERRUPT.
+ */
+ ftick_hdr = fticket_in_header(otick);
+ reused_creds.cr_uid = ftick_hdr->uid;
+ reused_groups[0] = ftick_hdr->gid;
+ reused_creds.cr_groups = reused_groups;
+ fdisp_init(&fdi, sizeof(*fii));
+ fdisp_make_pid(&fdi, FUSE_INTERRUPT, data, ftick_hdr->nodeid,
+ ftick_hdr->pid, &reused_creds);
+
+ fii = fdi.indata;
+ fii->unique = otick->tk_unique;
+ fuse_insert_callback(fdi.tick, fuse_interrupt_callback);
+
+ otick->irq_unique = fdi.tick->tk_unique;
+ /* Interrupt ops should be delivered ASAP */
+ fuse_insert_message(fdi.tick, true);
+ fdisp_destroy(&fdi);
+ } else {
+ /* This ticket has already been interrupted */
+ }
}
void
@@ -181,14 +293,19 @@
}
fiov->allocated_size = FU_AT_LEAST(size);
fiov->credit = fuse_iov_credit;
+ /* Clear data buffer after reallocation */
+ bzero(fiov->base, size);
+ } else if (size > fiov->len) {
+ /* Clear newly extended portion of data buffer */
+ bzero((char*)fiov->base + fiov->len, size - fiov->len);
}
fiov->len = size;
}
+/* Resize the fiov if needed, and clear its buffer */
void
fiov_refresh(struct fuse_iov *fiov)
{
- bzero(fiov->base, fiov->len);
fiov_adjust(fiov, 0);
}
@@ -211,8 +328,10 @@
if (ftick->tk_unique == 0)
ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1);
+ ftick->irq_unique = 0;
+
refcount_init(&ftick->tk_refcount, 1);
- atomic_add_acq_int(&fuse_ticket_count, 1);
+ counter_u64_add(fuse_ticket_count, 1);
return 0;
}
@@ -227,7 +346,7 @@
FUSE_ASSERT_MS_DONE(ftick);
FUSE_ASSERT_AW_DONE(ftick);
- atomic_subtract_acq_int(&fuse_ticket_count, 1);
+ counter_u64_add(fuse_ticket_count, -1);
}
static int
@@ -269,7 +388,7 @@
return uma_zfree(ticket_zone, ftick);
}
-static inline
+static inline
void
fticket_refresh(struct fuse_ticket *ftick)
{
@@ -292,30 +411,65 @@
ftick->tk_flag = 0;
}
+/* Prepare the ticket to be reused, but don't clear its data buffers */
+static inline void
+fticket_reset(struct fuse_ticket *ftick)
+{
+ FUSE_ASSERT_MS_DONE(ftick);
+ FUSE_ASSERT_AW_DONE(ftick);
+
+ ftick->tk_ms_bufdata = NULL;
+ ftick->tk_ms_bufsize = 0;
+ ftick->tk_ms_type = FT_M_FIOV;
+
+ bzero(&ftick->tk_aw_ohead, sizeof(struct fuse_out_header));
+
+ ftick->tk_aw_errno = 0;
+ ftick->tk_aw_bufdata = NULL;
+ ftick->tk_aw_bufsize = 0;
+ ftick->tk_aw_type = FT_A_FIOV;
+
+ ftick->tk_flag = 0;
+}
+
static int
fticket_wait_answer(struct fuse_ticket *ftick)
{
- sigset_t tset;
- int err = 0;
- struct fuse_data *data;
+ struct thread *td = curthread;
+ sigset_t blockedset, oldset;
+ int err = 0, stops_deferred;
+ struct fuse_data *data = ftick->tk_data;
+ bool interrupted = false;
+ if (fsess_isimpl(ftick->tk_data->mp, FUSE_INTERRUPT) &&
+ data->dataflags & FSESS_INTR) {
+ SIGEMPTYSET(blockedset);
+ } else {
+ /* Block all signals except (implicitly) SIGKILL */
+ SIGFILLSET(blockedset);
+ }
+ stops_deferred = sigdeferstop(SIGDEFERSTOP_SILENT);
+ kern_sigprocmask(td, SIG_BLOCK, NULL, &oldset, 0);
+
fuse_lck_mtx_lock(ftick->tk_aw_mtx);
+retry:
if (fticket_answered(ftick)) {
goto out;
}
- data = ftick->tk_data;
if (fdata_get_dead(data)) {
err = ENOTCONN;
fticket_set_answered(ftick);
goto out;
}
- fuse_block_sigs(&tset);
+ kern_sigprocmask(td, SIG_BLOCK, &blockedset, NULL, 0);
err = msleep(ftick, &ftick->tk_aw_mtx, PCATCH, "fu_ans",
data->daemon_timeout * hz);
- fuse_restore_sigs(&tset);
- if (err == EAGAIN) { /* same as EWOULDBLOCK */
+ kern_sigprocmask(td, SIG_SETMASK, &oldset, NULL, 0);
+ if (err == EWOULDBLOCK) {
+ SDT_PROBE2(fusefs, , ipc, trace, 3,
+ "fticket_wait_answer: EWOULDBLOCK");
#ifdef XXXIP /* die conditionally */
if (!fdata_get_dead(data)) {
fdata_set_dead(data);
@@ -323,14 +477,64 @@
#endif
err = ETIMEDOUT;
fticket_set_answered(ftick);
+ } else if ((err == EINTR || err == ERESTART)) {
+ /*
+ * Whether we get EINTR or ERESTART depends on whether
+ * SA_RESTART was set by sigaction(2).
+ *
+ * Try to interrupt the operation and wait for an EINTR response
+ * to the original operation. If the file system does not
+ * support FUSE_INTERRUPT, then we'll just wait for it to
+ * complete like normal. If it does support FUSE_INTERRUPT,
+ * then it will either respond EINTR to the original operation,
+ * or EAGAIN to the interrupt.
+ */
+ sigset_t tmpset;
+
+ SDT_PROBE2(fusefs, , ipc, trace, 4,
+ "fticket_wait_answer: interrupt");
+ fuse_lck_mtx_unlock(ftick->tk_aw_mtx);
+ fuse_interrupt_send(ftick, err);
+
+ PROC_LOCK(td->td_proc);
+ mtx_lock(&td->td_proc->p_sigacts->ps_mtx);
+ tmpset = td->td_proc->p_siglist;
+ SIGSETOR(tmpset, td->td_siglist);
+ mtx_unlock(&td->td_proc->p_sigacts->ps_mtx);
+ PROC_UNLOCK(td->td_proc);
+
+ fuse_lck_mtx_lock(ftick->tk_aw_mtx);
+ if (!interrupted && !SIGISMEMBER(tmpset, SIGKILL)) {
+ /*
+ * Block all signals while we wait for an interrupt
+ * response. The protocol doesn't discriminate between
+ * different signals.
+ */
+ SIGFILLSET(blockedset);
+ interrupted = true;
+ goto retry;
+ } else {
+ /*
+ * Return immediately for fatal signals, or if this is
+ * the second interruption. We should only be
+ * interrupted twice if the thread is stopped, for
+ * example during sigexit.
+ */
+ }
+ } else if (err) {
+ SDT_PROBE2(fusefs, , ipc, trace, 6,
+ "fticket_wait_answer: other error");
+ } else {
+ SDT_PROBE2(fusefs, , ipc, trace, 7, "fticket_wait_answer: OK");
}
out:
if (!(err || fticket_answered(ftick))) {
- SDT_PROBE2(fuse, , ipc, trace, 1,
+ SDT_PROBE2(fusefs, , ipc, trace, 1,
"FUSE: requester was woken up but still no answer");
err = ENXIO;
}
fuse_lck_mtx_unlock(ftick->tk_aw_mtx);
+ sigallowstop(stops_deferred);
return err;
}
@@ -386,6 +590,8 @@
data->fdev = fdev;
mtx_init(&data->ms_mtx, "fuse message list mutex", NULL, MTX_DEF);
STAILQ_INIT(&data->ms_head);
+ data->ms_count = 0;
+ knlist_init_mtx(&data->ks_rsel.si_note, &data->ms_mtx);
mtx_init(&data->aw_mtx, "fuse answer list mutex", NULL, MTX_DEF);
TAILQ_INIT(&data->aw_head);
data->daemoncred = crhold(cred);
@@ -405,11 +611,12 @@
return;
/* Driving off stage all that stuff thrown at device... */
- mtx_destroy(&data->ms_mtx);
- mtx_destroy(&data->aw_mtx);
sx_destroy(&data->rename_lock);
-
crfree(data->daemoncred);
+ mtx_destroy(&data->aw_mtx);
+ knlist_delete(&data->ks_rsel.si_note, curthread, 0);
+ knlist_destroy(&data->ks_rsel.si_note);
+ mtx_destroy(&data->ms_mtx);
free(data, M_FUSEMSG);
}
@@ -478,8 +685,14 @@
fuse_lck_mtx_unlock(ftick->tk_data->aw_mtx);
}
+/*
+ * Insert a new upgoing ticket into the message queue
+ *
+ * If urgent is true, insert at the front of the queue. Otherwise, insert in
+ * FIFO order.
+ */
void
-fuse_insert_message(struct fuse_ticket *ftick)
+fuse_insert_message(struct fuse_ticket *ftick, bool urgent)
{
if (ftick->tk_flag & FT_DIRTY) {
panic("FUSE: ticket reused without being refreshed");
@@ -490,9 +703,13 @@
return;
}
fuse_lck_mtx_lock(ftick->tk_data->ms_mtx);
- fuse_ms_push(ftick);
+ if (urgent)
+ fuse_ms_push_head(ftick);
+ else
+ fuse_ms_push(ftick);
wakeup_one(ftick->tk_data);
selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1);
+ KNOTE_LOCKED(&ftick->tk_data->ks_rsel.si_note, 0);
fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx);
}
@@ -505,8 +722,21 @@
opcode = fticket_opcode(ftick);
switch (opcode) {
+ case FUSE_BMAP:
+ err = (blen == sizeof(struct fuse_bmap_out)) ? 0 : EINVAL;
+ break;
+
+ case FUSE_LINK:
case FUSE_LOOKUP:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
+ case FUSE_MKDIR:
+ case FUSE_MKNOD:
+ case FUSE_SYMLINK:
+ if (fuse_libabi_geq(ftick->tk_data, 7, 9)) {
+ err = (blen == sizeof(struct fuse_entry_out)) ?
+ 0 : EINVAL;
+ } else {
+ err = (blen == FUSE_COMPAT_ENTRY_OUT_SIZE) ? 0 : EINVAL;
+ }
break;
case FUSE_FORGET:
@@ -514,29 +744,19 @@
break;
case FUSE_GETATTR:
- err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL;
- break;
-
case FUSE_SETATTR:
- err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL;
+ if (fuse_libabi_geq(ftick->tk_data, 7, 9)) {
+ err = (blen == sizeof(struct fuse_attr_out)) ?
+ 0 : EINVAL;
+ } else {
+ err = (blen == FUSE_COMPAT_ATTR_OUT_SIZE) ? 0 : EINVAL;
+ }
break;
case FUSE_READLINK:
err = (PAGE_SIZE >= blen) ? 0 : EINVAL;
break;
- case FUSE_SYMLINK:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
- break;
-
- case FUSE_MKNOD:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
- break;
-
- case FUSE_MKDIR:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
- break;
-
case FUSE_UNLINK:
err = (blen == 0) ? 0 : EINVAL;
break;
@@ -549,10 +769,6 @@
err = (blen == 0) ? 0 : EINVAL;
break;
- case FUSE_LINK:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
- break;
-
case FUSE_OPEN:
err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL;
break;
@@ -607,7 +823,9 @@
break;
case FUSE_INIT:
- if (blen == sizeof(struct fuse_init_out) || blen == 8) {
+ if (blen == sizeof(struct fuse_init_out) ||
+ blen == FUSE_COMPAT_INIT_OUT_SIZE ||
+ blen == FUSE_COMPAT_22_INIT_OUT_SIZE) {
err = 0;
} else {
err = EINVAL;
@@ -634,15 +852,15 @@
break;
case FUSE_GETLK:
- panic("FUSE: no response body format check for FUSE_GETLK");
+ err = (blen == sizeof(struct fuse_lk_out)) ? 0 : EINVAL;
break;
case FUSE_SETLK:
- panic("FUSE: no response body format check for FUSE_SETLK");
+ err = (blen == 0) ? 0 : EINVAL;
break;
case FUSE_SETLKW:
- panic("FUSE: no response body format check for FUSE_SETLKW");
+ err = (blen == 0) ? 0 : EINVAL;
break;
case FUSE_ACCESS:
@@ -650,8 +868,13 @@
break;
case FUSE_CREATE:
- err = (blen == sizeof(struct fuse_entry_out) +
- sizeof(struct fuse_open_out)) ? 0 : EINVAL;
+ if (fuse_libabi_geq(ftick->tk_data, 7, 9)) {
+ err = (blen == sizeof(struct fuse_entry_out) +
+ sizeof(struct fuse_open_out)) ? 0 : EINVAL;
+ } else {
+ err = (blen == FUSE_COMPAT_ENTRY_OUT_SIZE +
+ sizeof(struct fuse_open_out)) ? 0 : EINVAL;
+ }
break;
case FUSE_DESTROY:
@@ -677,7 +900,7 @@
ihead->pid = pid;
ihead->uid = cred->cr_uid;
- ihead->gid = cred->cr_rgid;
+ ihead->gid = cred->cr_groups[0];
}
/*
@@ -705,18 +928,38 @@
return err;
}
-void
-fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+/*
+ * Reinitialize a dispatcher from a pid and node id, without resizing or
+ * clearing its data buffers
+ */
+static void
+fdisp_refresh_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred)
{
- struct fuse_data *data = fuse_get_mpdata(mp);
+ MPASS(fdip->tick);
+ MPASS2(sizeof(fdip->finh) + fdip->iosize <= fdip->tick->tk_ms_fiov.len,
+ "Must use fdisp_make_pid to increase the size of the fiov");
+ fticket_reset(fdip->tick);
+ FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh,
+ fdip->indata, fdip->iosize);
+
+ fuse_setup_ihead(fdip->finh, fdip->tick, nid, op, fdip->iosize, pid,
+ cred);
+}
+
+/* Initialize a dispatcher from a pid and node id */
+static void
+fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+ struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred)
+{
if (fdip->tick) {
fticket_refresh(fdip->tick);
} else {
fdip->tick = fuse_ticket_fetch(data);
}
+ /* FUSE_DIMALLOC will bzero the fiovs when it enlarges them */
FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh,
fdip->indata, fdip->iosize);
@@ -727,22 +970,42 @@
fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp,
uint64_t nid, struct thread *td, struct ucred *cred)
{
+ struct fuse_data *data = fuse_get_mpdata(mp);
RECTIFY_TDCR(td, cred);
- return fdisp_make_pid(fdip, op, mp, nid, td->td_proc->p_pid, cred);
+ return fdisp_make_pid(fdip, op, data, nid, td->td_proc->p_pid, cred);
}
void
fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct vnode *vp, struct thread *td, struct ucred *cred)
{
+ struct mount *mp = vnode_mount(vp);
+ struct fuse_data *data = fuse_get_mpdata(mp);
+
RECTIFY_TDCR(td, cred);
- return fdisp_make_pid(fdip, op, vnode_mount(vp), VTOI(vp),
+ return fdisp_make_pid(fdip, op, data, VTOI(vp),
td->td_proc->p_pid, cred);
}
-SDT_PROBE_DEFINE2(fuse, , ipc, fdisp_wait_answ_error, "char*", "int");
+/* Refresh a fuse_dispatcher so it can be reused, but don't zero its data */
+void
+fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+ struct vnode *vp, struct thread *td, struct ucred *cred)
+{
+ RECTIFY_TDCR(td, cred);
+ return fdisp_refresh_pid(fdip, op, vnode_mount(vp), VTOI(vp),
+ td->td_proc->p_pid, cred);
+}
+void
+fdisp_refresh(struct fuse_dispatcher *fdip)
+{
+ fticket_refresh(fdip->tick);
+}
+
+SDT_PROBE_DEFINE2(fusefs, , ipc, fdisp_wait_answ_error, "char*", "int");
+
int
fdisp_wait_answ(struct fuse_dispatcher *fdip)
{
@@ -750,7 +1013,7 @@
fdip->answ_stat = 0;
fuse_insert_callback(fdip->tick, fuse_standard_handler);
- fuse_insert_message(fdip->tick);
+ fuse_insert_message(fdip->tick, false);
if ((err = fticket_wait_answer(fdip->tick))) {
fuse_lck_mtx_lock(fdip->tick->tk_aw_mtx);
@@ -761,7 +1024,7 @@
* the standard handler has completed his job.
* So we drop the ticket and exit as usual.
*/
- SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+ SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error,
"IPC: interrupted, already answered", err);
fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx);
goto out;
@@ -771,7 +1034,7 @@
* Then by setting the answered flag we get *him*
* to drop the ticket.
*/
- SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+ SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error,
"IPC: interrupted, setting to answered", err);
fticket_set_answered(fdip->tick);
fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx);
@@ -779,14 +1042,22 @@
}
}
- if (fdip->tick->tk_aw_errno) {
- SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+ if (fdip->tick->tk_aw_errno == ENOTCONN) {
+ /* The daemon died while we were waiting for a response */
+ err = ENOTCONN;
+ goto out;
+ } else if (fdip->tick->tk_aw_errno) {
+ /*
+ * There was some sort of communication error with the daemon
+ * that the client wouldn't understand.
+ */
+ SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error,
"IPC: explicit EIO-ing", fdip->tick->tk_aw_errno);
err = EIO;
goto out;
}
if ((err = fdip->tick->tk_aw_ohead.error)) {
- SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+ SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error,
"IPC: setting status", fdip->tick->tk_aw_ohead.error);
/*
* This means a "proper" fuse syscall error.
@@ -815,10 +1086,13 @@
ticket_zone = uma_zcreate("fuse_ticket", sizeof(struct fuse_ticket),
fticket_ctor, fticket_dtor, fticket_init, fticket_fini,
UMA_ALIGN_PTR, 0);
+ fuse_ticket_count = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_ticket_count);
}
void
fuse_ipc_destroy(void)
{
+ counter_u64_free(fuse_ticket_count);
uma_zdestroy(ticket_zone);
}
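
fuse_interrupt_callback() above sorts the daemon's reply to a FUSE_INTERRUPT into three cases: ignore it (the original request already completed, or the daemon returned ENOSYS and so does not implement interrupts), resend it (EAGAIN may mean the daemon saw the interrupt before the original request), or treat it as a protocol error. A compact userspace sketch of that decision; the enum and function names are hypothetical, and the real code operates on tickets rather than bare error numbers:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical actions matching the branches of fuse_interrupt_callback(). */
enum irq_action {
	IRQ_IGNORE,	/* original already answered, or daemon lacks support */
	IRQ_RESEND,	/* daemon saw the interrupt too early or too late */
	IRQ_PROTO_ERR,	/* illegal FUSE_INTERRUPT response */
};

static enum irq_action
classify_interrupt_reply(bool original_pending, int daemon_error)
{
	if (!original_pending)
		return (IRQ_IGNORE);		/* nothing left to interrupt */
	switch (daemon_error) {
	case ENOSYS:
		/* The file system does not implement FUSE_INTERRUPT. */
		return (IRQ_IGNORE);
	case EAGAIN:
		/*
		 * Either the daemon received the interrupt before the
		 * original request, or after completing it; resending is
		 * harmless in both cases.
		 */
		return (IRQ_RESEND);
	default:
		return (IRQ_PROTO_ERR);
	}
}

int
main(void)
{
	printf("EAGAIN -> %d, ENOSYS -> %d, completed -> %d\n",
	    classify_interrupt_reply(true, EAGAIN),
	    classify_interrupt_reply(true, ENOSYS),
	    classify_interrupt_reply(false, 0));
	return (0);
}
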
Index: sys/fs/fuse/fuse_kernel.h
===================================================================
--- sys/fs/fuse/fuse_kernel.h
+++ sys/fs/fuse/fuse_kernel.h
@@ -1,6 +1,6 @@
/*--
* This file defines the kernel interface of FUSE
- * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ * Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
*
* This program can be distributed under the terms of the GNU GPL.
* See the file COPYING.
@@ -34,69 +34,134 @@
* $FreeBSD$
*/
-#ifndef linux
-#include <sys/types.h>
-#define __u64 uint64_t
-#define __u32 uint32_t
-#define __s32 int32_t
+/*
+ * This file defines the kernel interface of FUSE
+ *
+ * Protocol changelog:
+ *
+ * 7.9:
+ * - new fuse_getattr_in input argument of GETATTR
+ * - add lk_flags in fuse_lk_in
+ * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in
+ * - add blksize field to fuse_attr
+ * - add file flags field to fuse_read_in and fuse_write_in
+ *
+ * 7.10
+ * - add nonseekable open flag
+ *
+ * 7.11
+ * - add IOCTL message
+ * - add unsolicited notification support
+ *
+ * 7.12
+ * - add umask flag to input argument of open, mknod and mkdir
+ * - add notification messages for invalidation of inodes and
+ * directory entries
+ *
+ * 7.13
+ * - make max number of background requests and congestion threshold
+ * tunables
+ *
+ * 7.14
+ * - add splice support to fuse device
+ *
+ * 7.15
+ * - add store notify
+ * - add retrieve notify
+ *
+ * 7.16
+ * - add BATCH_FORGET request
+ * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct
+ * fuse_ioctl_iovec' instead of ambiguous 'struct iovec'
+ * - add FUSE_IOCTL_32BIT flag
+ *
+ * 7.17
+ * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK
+ *
+ * 7.18
+ * - add FUSE_IOCTL_DIR flag
+ * - add FUSE_NOTIFY_DELETE
+ *
+ * 7.19
+ * - add FUSE_FALLOCATE
+ *
+ * 7.20
+ * - add FUSE_AUTO_INVAL_DATA
+ * 7.21
+ * - add FUSE_READDIRPLUS
+ * - send the requested events in POLL request
+ *
+ * 7.22
+ * - add FUSE_ASYNC_DIO
+ *
+ * 7.23
+ * - add FUSE_WRITEBACK_CACHE
+ * - add time_gran to fuse_init_out
+ * - add reserved space to fuse_init_out
+ * - add FATTR_CTIME
+ * - add ctime and ctimensec to fuse_setattr_in
+ * - add FUSE_RENAME2 request
+ * - add FUSE_NO_OPEN_SUPPORT flag
+ */
+
+#ifndef _FUSE_FUSE_KERNEL_H
+#define _FUSE_FUSE_KERNEL_H
+
+#ifdef __linux__
+#include <linux/types.h>
#else
-#include <asm/types.h>
-#include <linux/major.h>
+#include <sys/types.h>
#endif
/** Version number of this interface */
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 8
+#define FUSE_KERNEL_MINOR_VERSION 23
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
-/** The major number of the fuse character device */
-#define FUSE_MAJOR MISC_MAJOR
-
-/** The minor number of the fuse character device */
-#define FUSE_MINOR 229
-
/* Make sure all structures are padded to 64bit boundary, so 32bit
userspace works under 64bit kernels */
struct fuse_attr {
- __u64 ino;
- __u64 size;
- __u64 blocks;
- __u64 atime;
- __u64 mtime;
- __u64 ctime;
- __u32 atimensec;
- __u32 mtimensec;
- __u32 ctimensec;
- __u32 mode;
- __u32 nlink;
- __u32 uid;
- __u32 gid;
- __u32 rdev;
+ uint64_t ino;
+ uint64_t size;
+ uint64_t blocks;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t ctime;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t ctimensec;
+ uint32_t mode;
+ uint32_t nlink;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t rdev;
+ uint32_t blksize;
+ uint32_t padding;
};
struct fuse_kstatfs {
- __u64 blocks;
- __u64 bfree;
- __u64 bavail;
- __u64 files;
- __u64 ffree;
- __u32 bsize;
- __u32 namelen;
- __u32 frsize;
- __u32 padding;
- __u32 spare[6];
+ uint64_t blocks;
+ uint64_t bfree;
+ uint64_t bavail;
+ uint64_t files;
+ uint64_t ffree;
+ uint32_t bsize;
+ uint32_t namelen;
+ uint32_t frsize;
+ uint32_t padding;
+ uint32_t spare[6];
};
struct fuse_file_lock {
- __u64 start;
- __u64 end;
- __u32 type;
- __u32 pid; /* tgid */
+ uint64_t start;
+ uint64_t end;
+ uint32_t type;
+ uint32_t pid; /* tgid */
};
/**
@@ -109,27 +174,128 @@
#define FATTR_ATIME (1 << 4)
#define FATTR_MTIME (1 << 5)
#define FATTR_FH (1 << 6)
+#define FATTR_ATIME_NOW (1 << 7)
+#define FATTR_MTIME_NOW (1 << 8)
+#define FATTR_LOCKOWNER (1 << 9)
+#define FATTR_CTIME (1 << 10)
/**
* Flags returned by the OPEN request
*
* FOPEN_DIRECT_IO: bypass page cache for this open file
* FOPEN_KEEP_CACHE: don't invalidate the data cache on open
+ * FOPEN_NONSEEKABLE: the file is not seekable
*/
#define FOPEN_DIRECT_IO (1 << 0)
#define FOPEN_KEEP_CACHE (1 << 1)
+#define FOPEN_NONSEEKABLE (1 << 2)
/**
* INIT request/reply flags
+ *
+ * FUSE_ASYNC_READ: asynchronous read requests
+ * FUSE_POSIX_LOCKS: remote locking for POSIX file locks
+ * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported)
+ * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem
+ * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".."
+ * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB
+ * FUSE_DONT_MASK: don't apply umask to file mode on create operations
+ * FUSE_SPLICE_WRITE: kernel supports splice write on the device
+ * FUSE_SPLICE_MOVE: kernel supports splice move on the device
+ * FUSE_SPLICE_READ: kernel supports splice read on the device
+ * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks
+ * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories
+ * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages
+ * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one)
+ * FUSE_READDIRPLUS_AUTO: adaptive readdirplus
+ * FUSE_ASYNC_DIO: asynchronous direct I/O submission
+ * FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes
+ * FUSE_NO_OPEN_SUPPORT: kernel supports zero-message opens
*/
#define FUSE_ASYNC_READ (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)
+#define FUSE_FILE_OPS (1 << 2)
+#define FUSE_ATOMIC_O_TRUNC (1 << 3)
+#define FUSE_EXPORT_SUPPORT (1 << 4)
+#define FUSE_BIG_WRITES (1 << 5)
+#define FUSE_DONT_MASK (1 << 6)
+#define FUSE_SPLICE_WRITE (1 << 7)
+#define FUSE_SPLICE_MOVE (1 << 8)
+#define FUSE_SPLICE_READ (1 << 9)
+#define FUSE_FLOCK_LOCKS (1 << 10)
+#define FUSE_HAS_IOCTL_DIR (1 << 11)
+#define FUSE_AUTO_INVAL_DATA (1 << 12)
+#define FUSE_DO_READDIRPLUS (1 << 13)
+#define FUSE_READDIRPLUS_AUTO (1 << 14)
+#define FUSE_ASYNC_DIO (1 << 15)
+#define FUSE_WRITEBACK_CACHE (1 << 16)
+#define FUSE_NO_OPEN_SUPPORT (1 << 17)
+#ifdef linux
/**
+ * CUSE INIT request/reply flags
+ *
+ * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl
+ */
+#define CUSE_UNRESTRICTED_IOCTL (1 << 0)
+#endif /* linux */
+
+/**
* Release flags
*/
#define FUSE_RELEASE_FLUSH (1 << 0)
+#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1)
+/**
+ * Getattr flags
+ */
+#define FUSE_GETATTR_FH (1 << 0)
+
+/**
+ * Lock flags
+ */
+#define FUSE_LK_FLOCK (1 << 0)
+
+/**
+ * WRITE flags
+ *
+ * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed
+ * FUSE_WRITE_LOCKOWNER: lock_owner field is valid
+ */
+#define FUSE_WRITE_CACHE (1 << 0)
+#define FUSE_WRITE_LOCKOWNER (1 << 1)
+
+/**
+ * Read flags
+ */
+#define FUSE_READ_LOCKOWNER (1 << 1)
+
+/**
+ * Ioctl flags
+ *
+ * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
+ * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed
+ * FUSE_IOCTL_RETRY: retry with new iovecs
+ * FUSE_IOCTL_32BIT: 32bit ioctl
+ * FUSE_IOCTL_DIR: is a directory
+ *
+ * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs
+ */
+#define FUSE_IOCTL_COMPAT (1 << 0)
+#define FUSE_IOCTL_UNRESTRICTED (1 << 1)
+#define FUSE_IOCTL_RETRY (1 << 2)
+#define FUSE_IOCTL_32BIT (1 << 3)
+#define FUSE_IOCTL_DIR (1 << 4)
+
+#define FUSE_IOCTL_MAX_IOV 256
+
+/**
+ * Poll flags
+ *
+ * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify
+ */
+#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0)
+
enum fuse_opcode {
FUSE_LOOKUP = 1,
FUSE_FORGET = 2, /* no reply */
@@ -167,107 +333,179 @@
FUSE_INTERRUPT = 36,
FUSE_BMAP = 37,
FUSE_DESTROY = 38,
+ FUSE_IOCTL = 39,
+ FUSE_POLL = 40,
+ FUSE_NOTIFY_REPLY = 41,
+ FUSE_BATCH_FORGET = 42,
+ FUSE_FALLOCATE = 43,
+ FUSE_READDIRPLUS = 44,
+ FUSE_RENAME2 = 45,
+
+#ifdef linux
+ /* CUSE specific operations */
+ CUSE_INIT = 4096,
+#endif /* linux */
};
+enum fuse_notify_code {
+ FUSE_NOTIFY_POLL = 1,
+ FUSE_NOTIFY_INVAL_INODE = 2,
+ FUSE_NOTIFY_INVAL_ENTRY = 3,
+ FUSE_NOTIFY_STORE = 4,
+ FUSE_NOTIFY_RETRIEVE = 5,
+ FUSE_NOTIFY_DELETE = 6,
+ FUSE_NOTIFY_CODE_MAX,
+};
+
/* The read buffer is required to be at least 8k, but may be much larger */
#define FUSE_MIN_READ_BUFFER 8192
+#define FUSE_COMPAT_ENTRY_OUT_SIZE 120
+
struct fuse_entry_out {
- __u64 nodeid; /* Inode ID */
- __u64 generation; /* Inode generation: nodeid:gen must
- be unique for the fs's lifetime */
- __u64 entry_valid; /* Cache timeout for the name */
- __u64 attr_valid; /* Cache timeout for the attributes */
- __u32 entry_valid_nsec;
- __u32 attr_valid_nsec;
+ uint64_t nodeid; /* Inode ID */
+ uint64_t generation; /* Inode generation: nodeid:gen must
+ be unique for the fs's lifetime */
+ uint64_t entry_valid; /* Cache timeout for the name */
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t entry_valid_nsec;
+ uint32_t attr_valid_nsec;
struct fuse_attr attr;
};
struct fuse_forget_in {
- __u64 nlookup;
+ uint64_t nlookup;
};
+struct fuse_forget_one {
+ uint64_t nodeid;
+ uint64_t nlookup;
+};
+
+struct fuse_batch_forget_in {
+ uint32_t count;
+ uint32_t dummy;
+};
+
+struct fuse_getattr_in {
+ uint32_t getattr_flags;
+ uint32_t dummy;
+ uint64_t fh;
+};
+
+#define FUSE_COMPAT_ATTR_OUT_SIZE 96
+
struct fuse_attr_out {
- __u64 attr_valid; /* Cache timeout for the attributes */
- __u32 attr_valid_nsec;
- __u32 dummy;
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t attr_valid_nsec;
+ uint32_t dummy;
struct fuse_attr attr;
};
+#define FUSE_COMPAT_MKNOD_IN_SIZE 8
+
+struct fuse_mknod_in {
+ uint32_t mode;
+ uint32_t rdev;
+ uint32_t umask;
+ uint32_t padding;
+};
+
struct fuse_mkdir_in {
- __u32 mode;
- __u32 padding;
+ uint32_t mode;
+ uint32_t umask;
};
struct fuse_rename_in {
- __u64 newdir;
+ uint64_t newdir;
};
+struct fuse_rename2_in {
+ uint64_t newdir;
+ uint32_t flags;
+ uint32_t padding;
+};
+
struct fuse_link_in {
- __u64 oldnodeid;
+ uint64_t oldnodeid;
};
struct fuse_setattr_in {
- __u32 valid;
- __u32 padding;
- __u64 fh;
- __u64 size;
- __u64 unused1;
- __u64 atime;
- __u64 mtime;
- __u64 unused2;
- __u32 atimensec;
- __u32 mtimensec;
- __u32 unused3;
- __u32 mode;
- __u32 unused4;
- __u32 uid;
- __u32 gid;
- __u32 unused5;
+ uint32_t valid;
+ uint32_t padding;
+ uint64_t fh;
+ uint64_t size;
+ uint64_t lock_owner;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t ctime;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t ctimensec;
+ uint32_t mode;
+ uint32_t unused4;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t unused5;
};
struct fuse_open_in {
- __u32 flags;
- __u32 mode;
+ uint32_t flags;
+ uint32_t unused;
};
+struct fuse_create_in {
+ uint32_t flags;
+ uint32_t mode;
+ uint32_t umask;
+ uint32_t padding;
+};
+
struct fuse_open_out {
- __u64 fh;
- __u32 open_flags;
- __u32 padding;
+ uint64_t fh;
+ uint32_t open_flags;
+ uint32_t padding;
};
struct fuse_release_in {
- __u64 fh;
- __u32 flags;
- __u32 release_flags;
- __u64 lock_owner;
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t release_flags;
+ uint64_t lock_owner;
};
struct fuse_flush_in {
- __u64 fh;
- __u32 unused;
- __u32 padding;
- __u64 lock_owner;
+ uint64_t fh;
+ uint32_t unused;
+ uint32_t padding;
+ uint64_t lock_owner;
};
struct fuse_read_in {
- __u64 fh;
- __u64 offset;
- __u32 size;
- __u32 padding;
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t read_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
};
+#define FUSE_COMPAT_WRITE_IN_SIZE 24
+
struct fuse_write_in {
- __u64 fh;
- __u64 offset;
- __u32 size;
- __u32 write_flags;
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t write_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
};
struct fuse_write_out {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
#define FUSE_COMPAT_STATFS_SIZE 48
@@ -277,40 +515,42 @@
};
struct fuse_fsync_in {
- __u64 fh;
- __u32 fsync_flags;
- __u32 padding;
+ uint64_t fh;
+ uint32_t fsync_flags;
+ uint32_t padding;
};
+struct fuse_setxattr_in {
+ uint32_t size;
+ uint32_t flags;
+};
+
struct fuse_listxattr_in {
- __u32 size;
- __u32 flags;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_listxattr_out {
- __u32 size;
- __u32 flags;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_getxattr_in {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_getxattr_out {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
-struct fuse_setxattr_in {
- __u32 size;
- __u32 flags;
-};
-
struct fuse_lk_in {
- __u64 fh;
- __u64 owner;
+ uint64_t fh;
+ uint64_t owner;
struct fuse_file_lock lk;
+ uint32_t lk_flags;
+ uint32_t padding;
};
struct fuse_lk_out {
@@ -318,66 +558,197 @@
};
struct fuse_access_in {
- __u32 mask;
- __u32 padding;
+ uint32_t mask;
+ uint32_t padding;
};
struct fuse_init_in {
- __u32 major;
- __u32 minor;
- __u32 max_readahead;
- __u32 flags;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
};
+#define FUSE_COMPAT_INIT_OUT_SIZE 8
+#define FUSE_COMPAT_22_INIT_OUT_SIZE 24
+
struct fuse_init_out {
- __u32 major;
- __u32 minor;
- __u32 max_readahead;
- __u32 flags;
- __u32 unused;
- __u32 max_write;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
+ uint16_t max_background;
+ uint16_t congestion_threshold;
+ uint32_t max_write;
+ uint32_t time_gran;
+ uint32_t unused[9];
};
+#ifdef linux
+#define CUSE_INIT_INFO_MAX 4096
+
+struct cuse_init_in {
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
+};
+
+struct cuse_init_out {
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
+ uint32_t max_read;
+ uint32_t max_write;
+ uint32_t dev_major; /* chardev major */
+ uint32_t dev_minor; /* chardev minor */
+ uint32_t spare[10];
+};
+#endif /* linux */
+
struct fuse_interrupt_in {
- __u64 unique;
+ uint64_t unique;
};
struct fuse_bmap_in {
- __u64 block;
- __u32 blocksize;
- __u32 padding;
+ uint64_t block;
+ uint32_t blocksize;
+ uint32_t padding;
};
struct fuse_bmap_out {
- __u64 block;
+ uint64_t block;
};
+struct fuse_ioctl_in {
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t cmd;
+ uint64_t arg;
+ uint32_t in_size;
+ uint32_t out_size;
+};
+
+struct fuse_ioctl_iovec {
+ uint64_t base;
+ uint64_t len;
+};
+
+struct fuse_ioctl_out {
+ int32_t result;
+ uint32_t flags;
+ uint32_t in_iovs;
+ uint32_t out_iovs;
+};
+
+struct fuse_poll_in {
+ uint64_t fh;
+ uint64_t kh;
+ uint32_t flags;
+ uint32_t events;
+};
+
+struct fuse_poll_out {
+ uint32_t revents;
+ uint32_t padding;
+};
+
+struct fuse_notify_poll_wakeup_out {
+ uint64_t kh;
+};
+
+struct fuse_fallocate_in {
+ uint64_t fh;
+ uint64_t offset;
+ uint64_t length;
+ uint32_t mode;
+ uint32_t padding;
+};
+
struct fuse_in_header {
- __u32 len;
- __u32 opcode;
- __u64 unique;
- __u64 nodeid;
- __u32 uid;
- __u32 gid;
- __u32 pid;
- __u32 padding;
+ uint32_t len;
+ uint32_t opcode;
+ uint64_t unique;
+ uint64_t nodeid;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t pid;
+ uint32_t padding;
};
struct fuse_out_header {
- __u32 len;
- __s32 error;
- __u64 unique;
+ uint32_t len;
+ int32_t error;
+ uint64_t unique;
};
struct fuse_dirent {
- __u64 ino;
- __u64 off;
- __u32 namelen;
- __u32 type;
- char name[0];
+ uint64_t ino;
+ uint64_t off;
+ uint32_t namelen;
+ uint32_t type;
+ char name[];
};
#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
-#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
+#define FUSE_DIRENT_ALIGN(x) \
+ (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
#define FUSE_DIRENT_SIZE(d) \
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
+
+struct fuse_direntplus {
+ struct fuse_entry_out entry_out;
+ struct fuse_dirent dirent;
+};
+
+#define FUSE_NAME_OFFSET_DIRENTPLUS \
+ offsetof(struct fuse_direntplus, dirent.name)
+#define FUSE_DIRENTPLUS_SIZE(d) \
+ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen)
+
+struct fuse_notify_inval_inode_out {
+ uint64_t ino;
+ int64_t off;
+ int64_t len;
+};
+
+struct fuse_notify_inval_entry_out {
+ uint64_t parent;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_delete_out {
+ uint64_t parent;
+ uint64_t child;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_store_out {
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+struct fuse_notify_retrieve_out {
+ uint64_t notify_unique;
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+/* Matches the size of fuse_write_in */
+struct fuse_notify_retrieve_in {
+ uint64_t dummy1;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t dummy2;
+ uint64_t dummy3;
+ uint64_t dummy4;
+};
+
+#endif /* _FUSE_FUSE_KERNEL_H */
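
The dirent macros near the end of the header pad every directory entry record to a 64-bit boundary, which is why FUSE_DIRENT_SIZE depends only on namelen. A standalone check of that arithmetic; the struct and macros are copied from the header above, only main() is new:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct fuse_dirent {
	uint64_t ino;
	uint64_t off;
	uint32_t namelen;
	uint32_t type;
	char name[];
};

#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
#define FUSE_DIRENT_ALIGN(x) \
	(((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))

int
main(void)
{
	uint32_t namelen;

	/* The fixed header is 24 bytes; names are padded to 8-byte multiples. */
	for (namelen = 1; namelen <= 16; namelen += 5)
		printf("namelen %2u -> record size %zu\n", namelen,
		    (size_t)FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + namelen));
	return (0);
}

The padding keeps the 64-bit ino and off fields of the following record naturally aligned when entries are packed back to back in a READDIR reply.
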
Index: sys/fs/fuse/fuse_main.c
===================================================================
--- sys/fs/fuse/fuse_main.c
+++ sys/fs/fuse/fuse_main.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -77,6 +82,10 @@
#include <sys/sysctl.h>
#include "fuse.h"
+#include "fuse_file.h"
+#include "fuse_ipc.h"
+#include "fuse_internal.h"
+#include "fuse_node.h"
static void fuse_bringdown(eventhandler_tag eh_tag);
static int fuse_loader(struct module *m, int what, void *arg);
@@ -85,7 +94,7 @@
extern struct vfsops fuse_vfsops;
extern struct cdevsw fuse_cdevsw;
-extern struct vop_vector fuse_vnops;
+extern struct vop_vector fuse_fifonops;
extern uma_zone_t fuse_pbuf_zone;
static struct vfsconf fuse_vfsconf = {
@@ -96,11 +105,13 @@
.vfc_flags = VFCF_JAIL | VFCF_SYNTHETIC
};
+SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables");
+SYSCTL_NODE(_vfs_fusefs, OID_AUTO, stats, CTLFLAG_RW, 0, "FUSE statistics");
SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_major, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, FUSE_KERNEL_VERSION, "FUSE kernel abi major version");
SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_minor, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, FUSE_KERNEL_MINOR_VERSION, "FUSE kernel abi minor version");
-SDT_PROVIDER_DEFINE(fuse);
+SDT_PROVIDER_DEFINE(fusefs);
/******************************
*
@@ -111,7 +122,9 @@
static void
fuse_bringdown(eventhandler_tag eh_tag)
{
-
+ fuse_node_destroy();
+ fuse_internal_destroy();
+ fuse_file_destroy();
fuse_ipc_destroy();
fuse_device_destroy();
mtx_destroy(&fuse_mtx);
@@ -132,16 +145,14 @@
return (err);
}
fuse_ipc_init();
+ fuse_file_init();
+ fuse_internal_init();
+ fuse_node_init();
fuse_pbuf_zone = pbuf_zsecond_create("fusepbuf", nswbuf / 2);
/* vfs_modevent ignores its first arg */
if ((err = vfs_modevent(NULL, what, &fuse_vfsconf)))
fuse_bringdown(eh_tag);
- else
- printf("fuse-freebsd: version %s, FUSE ABI %d.%d\n",
- FUSE_FREEBSD_VERSION,
- FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
-
break;
case MOD_UNLOAD:
if ((err = vfs_modevent(NULL, what, &fuse_vfsconf)))
Index: sys/fs/fuse/fuse_node.h
===================================================================
--- sys/fs/fuse/fuse_node.h
+++ sys/fs/fuse/fuse_node.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -60,60 +65,121 @@
#ifndef _FUSE_NODE_H_
#define _FUSE_NODE_H_
+#include <sys/fnv_hash.h>
#include <sys/types.h>
#include <sys/mutex.h>
#include "fuse_file.h"
-#define FN_REVOKED 0x00000020
-#define FN_FLUSHINPROG 0x00000040
-#define FN_FLUSHWANT 0x00000080
-#define FN_SIZECHANGE 0x00000100
-#define FN_DIRECTIO 0x00000200
+#define FN_REVOKED 0x00000020
+#define FN_FLUSHINPROG 0x00000040
+#define FN_FLUSHWANT 0x00000080
+/*
+ * Indicates that the file's size is dirty; the kernel has changed it but not
+ * yet sent the change to the daemon. When this bit is set, the
+ * cached_attrs.va_size field does not time out.
+ */
+#define FN_SIZECHANGE 0x00000100
+#define FN_DIRECTIO 0x00000200
+/* Indicates that parent_nid is valid */
+#define FN_PARENT_NID 0x00000400
+/*
+ * Indicates that the file's cached timestamps are dirty. They will be flushed
+ * during the next SETATTR or WRITE. Until then, the cached fields will not
+ * time out.
+ */
+#define FN_MTIMECHANGE 0x00000800
+#define FN_CTIMECHANGE 0x00001000
+
struct fuse_vnode_data {
/** self **/
uint64_t nid;
+ uint64_t generation;
/** parent **/
- /* XXXIP very likely to be stale, it's not updated in rename() */
uint64_t parent_nid;
/** I/O **/
- struct fuse_filehandle fufh[FUFH_MAXTYPE];
+ /* List of file handles for all of the vnode's open file descriptors */
+ LIST_HEAD(, fuse_filehandle) handles;
/** flags **/
uint32_t flag;
/** meta **/
- bool valid_attr_cache;
+ /* The monotonic time after which the attr cache is invalid */
+ struct bintime attr_cache_timeout;
+ /*
+ * Monotonic time after which the entry is invalid. Used for lookups
+ * by nodeid instead of pathname.
+ */
+ struct bintime entry_cache_timeout;
struct vattr cached_attrs;
- off_t filesize;
uint64_t nlookup;
enum vtype vtype;
};
+/*
+ * This overlays the fid structure (see mount.h). Mostly the same as the types
+ * used by UFS and ext2.
+ */
+struct fuse_fid {
+ uint16_t len; /* Length of structure. */
+ uint16_t pad; /* Force 32-bit alignment. */
+ uint32_t gen; /* Generation number. */
+ uint64_t nid; /* FUSE node id. */
+};
+
#define VTOFUD(vp) \
((struct fuse_vnode_data *)((vp)->v_data))
#define VTOI(vp) (VTOFUD(vp)->nid)
-#define VTOVA(vp) \
- (VTOFUD(vp)->valid_attr_cache ? \
- &(VTOFUD(vp)->cached_attrs) : NULL)
+static inline struct vattr*
+VTOVA(struct vnode *vp)
+{
+ struct bintime now;
+
+ getbinuptime(&now);
+ if (bintime_cmp(&(VTOFUD(vp)->attr_cache_timeout), &now, >))
+ return &(VTOFUD(vp)->cached_attrs);
+ else
+ return NULL;
+}
+
+static inline void
+fuse_vnode_clear_attr_cache(struct vnode *vp)
+{
+ bintime_clear(&VTOFUD(vp)->attr_cache_timeout);
+}
+
+static uint32_t inline
+fuse_vnode_hash(uint64_t id)
+{
+ return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT));
+}
+
#define VTOILLU(vp) ((uint64_t)(VTOFUD(vp) ? VTOI(vp) : 0))
#define FUSE_NULL_ID 0
+extern struct vop_vector fuse_fifoops;
extern struct vop_vector fuse_vnops;
+int fuse_vnode_cmp(struct vnode *vp, void *nidp);
+
static inline void
fuse_vnode_setparent(struct vnode *vp, struct vnode *dvp)
{
if (dvp != NULL && vp->v_type == VDIR) {
MPASS(dvp->v_type == VDIR);
VTOFUD(vp)->parent_nid = VTOI(dvp);
+ VTOFUD(vp)->flag |= FN_PARENT_NID;
}
}
+int fuse_vnode_size(struct vnode *vp, off_t *filesize, struct ucred *cred,
+ struct thread *td);
+
void fuse_vnode_destroy(struct vnode *vp);
int fuse_vnode_get(struct mount *mp, struct fuse_entry_out *feo,
@@ -123,10 +189,14 @@
void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags,
struct thread *td);
-void fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred);
+int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred, pid_t pid);
-int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred);
-
int fuse_vnode_setsize(struct vnode *vp, off_t newsize);
+void fuse_vnode_undirty_cached_timestamps(struct vnode *vp);
+
+void fuse_vnode_update(struct vnode *vp, int flags);
+
+void fuse_node_init(void);
+void fuse_node_destroy(void);
#endif /* _FUSE_NODE_H_ */
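
VTOVA() above hands back the cached attributes only while a monotonic deadline (attr_cache_timeout) lies in the future, and fuse_vnode_clear_attr_cache() invalidates the cache by zeroing that deadline. A userspace analog of the same pattern, substituting clock_gettime(CLOCK_MONOTONIC) for the kernel's getbinuptime()/bintime comparison; all names below are hypothetical:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Hypothetical analog of the per-vnode attribute cache deadline. */
struct attr_cache {
	struct timespec timeout;	/* monotonic expiry; zero == invalid */
	long cached_size;		/* stand-in for struct vattr */
};

static bool
ts_before(const struct timespec *a, const struct timespec *b)
{
	return (a->tv_sec < b->tv_sec ||
	    (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec));
}

/* Like VTOVA: return the cached attrs only if the deadline is in the future. */
static long *
cache_lookup(struct attr_cache *c)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return (ts_before(&now, &c->timeout) ? &c->cached_size : NULL);
}

/* valid/valid_nsec play the role of attr_valid/attr_valid_nsec in a reply. */
static void
cache_store(struct attr_cache *c, long size, time_t valid, long valid_nsec)
{
	clock_gettime(CLOCK_MONOTONIC, &c->timeout);
	c->timeout.tv_sec += valid;
	c->timeout.tv_nsec += valid_nsec;
	if (c->timeout.tv_nsec >= 1000000000L) {
		c->timeout.tv_sec++;
		c->timeout.tv_nsec -= 1000000000L;
	}
	c->cached_size = size;
}

int
main(void)
{
	struct attr_cache c = { .timeout = { 0, 0 }, .cached_size = 0 };

	cache_store(&c, 4096, 1, 0);		/* valid for one second */
	printf("hit: %s\n", cache_lookup(&c) ? "yes" : "no");
	c.timeout = (struct timespec){ 0, 0 };	/* like fuse_vnode_clear_attr_cache */
	printf("hit: %s\n", cache_lookup(&c) ? "yes" : "no");
	return (0);
}
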
Index: sys/fs/fuse/fuse_node.c
===================================================================
--- sys/fs/fuse/fuse_node.c
+++ sys/fs/fuse/fuse_node.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -59,8 +64,9 @@
__FBSDID("$FreeBSD$");
#include <sys/types.h>
-#include <sys/module.h>
#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/module.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/kernel.h>
@@ -77,8 +83,8 @@
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/fcntl.h>
-#include <sys/fnv_hash.h>
#include <sys/priv.h>
+#include <sys/buf.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -89,65 +95,40 @@
#include "fuse_io.h"
#include "fuse_ipc.h"
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , node, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , node, trace, "int", "char*");
MALLOC_DEFINE(M_FUSEVN, "fuse_vnode", "fuse vnode private data");
static int sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS);
-static int fuse_node_count = 0;
+static counter_u64_t fuse_node_count;
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, node_count, CTLFLAG_RD,
- &fuse_node_count, 0, "Count of FUSE vnodes");
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, node_count, CTLFLAG_RD,
+ &fuse_node_count, "Count of FUSE vnodes");
int fuse_data_cache_mode = FUSE_CACHE_WT;
+/*
+ * DEPRECATED
+ * This sysctl is no longer needed as of fuse protocol 7.23. Individual
+ * servers can select the cache behavior they need for each mountpoint:
+ * - writethrough: the default
+ * - writeback: set FUSE_WRITEBACK_CACHE in fuse_init_out.flags
+ * - uncached: set FOPEN_DIRECT_IO for every file
+ * The sysctl is retained primarily for use by jails supporting older FUSE
+ * protocols. It may be removed entirely once FreeBSD 11.3 and 12.0 are EOL.
+ */
SYSCTL_PROC(_vfs_fusefs, OID_AUTO, data_cache_mode, CTLTYPE_INT|CTLFLAG_RW,
&fuse_data_cache_mode, 0, sysctl_fuse_cache_mode, "I",
"Zero: disable caching of FUSE file data; One: write-through caching "
"(default); Two: write-back caching (generally unsafe)");
-int fuse_data_cache_invalidate = 0;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, data_cache_invalidate, CTLFLAG_RW,
- &fuse_data_cache_invalidate, 0,
- "If non-zero, discard cached clean file data when there are no active file"
- " users");
-
-int fuse_mmap_enable = 1;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, mmap_enable, CTLFLAG_RW,
- &fuse_mmap_enable, 0,
- "If non-zero, and data_cache_mode is also non-zero, enable mmap(2) of "
- "FUSE files");
-
-int fuse_refresh_size = 0;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, refresh_size, CTLFLAG_RW,
- &fuse_refresh_size, 0,
- "If non-zero, and no dirty file extension data is buffered, fetch file "
- "size before write operations");
-
-int fuse_sync_resize = 1;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, sync_resize, CTLFLAG_RW,
- &fuse_sync_resize, 0,
- "If a cached write extended a file, inform FUSE filesystem of the changed"
- "size immediately subsequent to the issued writes");
-
-int fuse_fix_broken_io = 0;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, fix_broken_io, CTLFLAG_RW,
- &fuse_fix_broken_io, 0,
- "If non-zero, print a diagnostic warning if a userspace filesystem returns"
- " EIO on reads of recently extended portions of files");
-
static int
sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS)
{
@@ -174,9 +155,8 @@
fuse_vnode_init(struct vnode *vp, struct fuse_vnode_data *fvdat,
uint64_t nodeid, enum vtype vtyp)
{
- int i;
-
fvdat->nid = nodeid;
+ LIST_INIT(&fvdat->handles);
vattr_null(&fvdat->cached_attrs);
if (nodeid == FUSE_ROOT_ID) {
vp->v_vflag |= VV_ROOT;
@@ -184,10 +164,7 @@
vp->v_type = vtyp;
vp->v_data = fvdat;
- for (i = 0; i < FUFH_MAXTYPE; i++)
- fvdat->fufh[i].fh_type = FUFH_INVALID;
-
- atomic_add_acq_int(&fuse_node_count, 1);
+ counter_u64_add(fuse_node_count, 1);
}
void
@@ -196,23 +173,21 @@
struct fuse_vnode_data *fvdat = vp->v_data;
vp->v_data = NULL;
+ KASSERT(LIST_EMPTY(&fvdat->handles),
+ ("Destroying fuse vnode with open files!"));
free(fvdat, M_FUSEVN);
- atomic_subtract_acq_int(&fuse_node_count, 1);
+ counter_u64_add(fuse_node_count, -1);
}
-static int
+int
fuse_vnode_cmp(struct vnode *vp, void *nidp)
{
return (VTOI(vp) != *((uint64_t *)nidp));
}
-static uint32_t inline
-fuse_vnode_hash(uint64_t id)
-{
- return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT));
-}
-
+SDT_PROBE_DEFINE3(fusefs, , node, stale_vnode, "struct vnode*", "enum vtype",
+ "uint64_t");
static int
fuse_vnode_alloc(struct mount *mp,
struct thread *td,
@@ -220,10 +195,12 @@
enum vtype vtyp,
struct vnode **vpp)
{
+ struct fuse_data *data;
struct fuse_vnode_data *fvdat;
struct vnode *vp2;
int err = 0;
+ data = fuse_get_mpdata(mp);
if (vtyp == VNON) {
return EINVAL;
}
@@ -234,12 +211,34 @@
return (err);
if (*vpp) {
- MPASS((*vpp)->v_type == vtyp && (*vpp)->v_data != NULL);
- SDT_PROBE2(fuse, , node, trace, 1, "vnode taken from hash");
+ if ((*vpp)->v_type != vtyp) {
+ /*
+ * STALE vnode! This probably indicates a buggy
+ * server, but it could also be the result of a race
+ * between FUSE_LOOKUP and another client's
+ * FUSE_UNLINK/FUSE_CREATE
+ */
+ SDT_PROBE3(fusefs, , node, stale_vnode, *vpp, vtyp,
+ nodeid);
+ fuse_internal_vnode_disappear(*vpp);
+ lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL);
+ *vpp = NULL;
+ return (EAGAIN);
+ }
+ MPASS((*vpp)->v_data != NULL);
+ MPASS(VTOFUD(*vpp)->nid == nodeid);
+ SDT_PROBE2(fusefs, , node, trace, 1, "vnode taken from hash");
return (0);
}
fvdat = malloc(sizeof(*fvdat), M_FUSEVN, M_WAITOK | M_ZERO);
- err = getnewvnode("fuse", mp, &fuse_vnops, vpp);
+ switch (vtyp) {
+ case VFIFO:
+ err = getnewvnode("fuse", mp, &fuse_fifoops, vpp);
+ break;
+ default:
+ err = getnewvnode("fuse", mp, &fuse_vnops, vpp);
+ break;
+ }
if (err) {
free(fvdat, M_FUSEVN);
return (err);
@@ -249,14 +248,23 @@
err = insmntque(*vpp, mp);
ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc");
if (err) {
+ lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL);
free(fvdat, M_FUSEVN);
*vpp = NULL;
return (err);
}
+ /* Disallow async reads for fifos because UFS does. I don't know why */
+ if (data->dataflags & FSESS_ASYNC_READ && vtyp != VFIFO)
+ VN_LOCK_ASHARE(*vpp);
+
err = vfs_hash_insert(*vpp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE,
td, &vp2, fuse_vnode_cmp, &nodeid);
- if (err)
+ if (err) {
+ lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL);
+ free(fvdat, M_FUSEVN);
+ *vpp = NULL;
return (err);
+ }
if (vp2 != NULL) {
*vpp = vp2;
return (0);
@@ -277,6 +285,11 @@
enum vtype vtyp)
{
struct thread *td = (cnp != NULL ? cnp->cn_thread : curthread);
+ /*
+ * feo should only be NULL for the root directory, which (when libfuse
+ * is used) always has generation 0
+ */
+ uint64_t generation = feo ? feo->generation : 0;
int err = 0;
err = fuse_vnode_alloc(mp, td, nodeid, vtyp, vpp);
@@ -284,22 +297,28 @@
return err;
}
if (dvp != NULL) {
- MPASS((cnp->cn_flags & ISDOTDOT) == 0);
- MPASS(!(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'));
+ MPASS(cnp && (cnp->cn_flags & ISDOTDOT) == 0);
+ MPASS(cnp &&
+ !(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'));
fuse_vnode_setparent(*vpp, dvp);
}
if (dvp != NULL && cnp != NULL && (cnp->cn_flags & MAKEENTRY) != 0 &&
feo != NULL &&
(feo->entry_valid != 0 || feo->entry_valid_nsec != 0)) {
+ struct timespec timeout;
+
ASSERT_VOP_LOCKED(*vpp, "fuse_vnode_get");
ASSERT_VOP_LOCKED(dvp, "fuse_vnode_get");
- cache_enter(dvp, *vpp, cnp);
+
+ fuse_validity_2_timespec(feo, &timeout);
+ cache_enter_time(dvp, *vpp, cnp, &timeout, NULL);
}
+ VTOFUD(*vpp)->generation = generation;
/*
* In userland, libfuse uses cached lookups for dot and dotdot entries,
* thus it does not really bump the nlookup counter for forget.
- * Follow the same semantic and avoid tu bump it in order to keep
+ * Follow the same semantic and avoid the bump in order to keep
* nlookup counters consistent.
*/
if (cnp == NULL || ((cnp->cn_flags & ISDOTDOT) == 0 &&
@@ -309,44 +328,19 @@
return 0;
}
+/*
+ * Called for every fusefs vnode open to initialize the vnode (not
+ * fuse_filehandle) for use
+ */
void
fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td)
{
- /*
- * Funcation is called for every vnode open.
- * Merge fuse_open_flags it may be 0
- */
- /*
- * Ideally speaking, direct io should be enabled on
- * fd's but do not see of any way of providing that
- * this implementation.
- *
- * Also cannot think of a reason why would two
- * different fd's on same vnode would like
- * have DIRECT_IO turned on and off. But linux
- * based implementation works on an fd not an
- * inode and provides such a feature.
- *
- * XXXIP: Handle fd based DIRECT_IO
- */
- if (fuse_open_flags & FOPEN_DIRECT_IO) {
- ASSERT_VOP_ELOCKED(vp, __func__);
- VTOFUD(vp)->flag |= FN_DIRECTIO;
- fuse_io_invalbuf(vp, td);
- } else {
- if ((fuse_open_flags & FOPEN_KEEP_CACHE) == 0)
- fuse_io_invalbuf(vp, td);
- VTOFUD(vp)->flag &= ~FN_DIRECTIO;
- }
-
- if (vnode_vtype(vp) == VREG) {
- /* XXXIP prevent getattr, by using cached node size */
+ if (vnode_vtype(vp) == VREG)
vnode_create_vobject(vp, 0, td);
- }
}
int
-fuse_vnode_savesize(struct vnode *vp, struct ucred *cred)
+fuse_vnode_savesize(struct vnode *vp, struct ucred *cred, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct thread *td = curthread;
@@ -375,10 +369,11 @@
fsai->valid = 0;
/* Truncate to a new value. */
- fsai->size = fvdat->filesize;
+ MPASS((fvdat->flag & FN_SIZECHANGE) != 0);
+ fsai->size = fvdat->cached_attrs.va_size;
fsai->valid |= FATTR_SIZE;
- fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh);
+ fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid);
if (fufh) {
fsai->fh = fufh->fh_id;
fsai->valid |= FATTR_FH;
@@ -391,38 +386,116 @@
return err;
}
-void
-fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred)
-{
-
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct vattr va;
-
- if ((fvdat->flag & FN_SIZECHANGE) != 0 ||
- fuse_data_cache_mode == FUSE_CACHE_UC ||
- (fuse_refresh_size == 0 && fvdat->filesize != 0))
- return;
-
- VOP_GETATTR(vp, &va, cred);
- SDT_PROBE2(fuse, , node, trace, 1, "refreshed file size");
-}
-
+/*
+ * Adjust the vnode's size to a new value, such as that provided by
+ * FUSE_GETATTR.
+ */
int
fuse_vnode_setsize(struct vnode *vp, off_t newsize)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct vattr *attrs;
off_t oldsize;
+ size_t iosize;
+ struct buf *bp = NULL;
int err = 0;
ASSERT_VOP_ELOCKED(vp, "fuse_vnode_setsize");
- oldsize = fvdat->filesize;
- fvdat->filesize = newsize;
- fvdat->flag |= FN_SIZECHANGE;
+ iosize = fuse_iosize(vp);
+ oldsize = fvdat->cached_attrs.va_size;
+ fvdat->cached_attrs.va_size = newsize;
+ if ((attrs = VTOVA(vp)) != NULL)
+ attrs->va_size = newsize;
if (newsize < oldsize) {
+ daddr_t lbn;
+
err = vtruncbuf(vp, newsize, fuse_iosize(vp));
+ if (err)
+ goto out;
+ if (newsize % iosize == 0)
+ goto out;
+ /*
+ * Zero the contents of the last partial block.
+ * Sure seems like vtruncbuf should do this for us.
+ */
+
+ lbn = newsize / iosize;
+ bp = getblk(vp, lbn, iosize, PCATCH, 0, 0);
+ if (!bp) {
+ err = EINTR;
+ goto out;
+ }
+ if (!(bp->b_flags & B_CACHE))
+ goto out; /* Nothing to do */
+ MPASS(bp->b_flags & B_VMIO);
+ vfs_bio_clrbuf(bp);
+ bp->b_dirtyend = MIN(bp->b_dirtyend, newsize - lbn * iosize);
}
+out:
+ if (bp)
+ brelse(bp);
vnode_pager_setsize(vp, newsize);
return err;
+}
+
+/* Get the current, possibly dirty, size of the file */
+int
+fuse_vnode_size(struct vnode *vp, off_t *filesize, struct ucred *cred,
+ struct thread *td)
+{
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ int error = 0;
+
+ if (!(fvdat->flag & FN_SIZECHANGE) &&
+ (VTOVA(vp) == NULL || fvdat->cached_attrs.va_size == VNOVAL))
+ error = fuse_internal_do_getattr(vp, NULL, cred, td);
+
+ if (!error)
+ *filesize = fvdat->cached_attrs.va_size;
+
+ return error;
+}
+
+void
+fuse_vnode_undirty_cached_timestamps(struct vnode *vp)
+{
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+
+ fvdat->flag &= ~(FN_MTIMECHANGE | FN_CTIMECHANGE);
+}
+
+/* Update a fuse file's cached timestamps */
+void
+fuse_vnode_update(struct vnode *vp, int flags)
+{
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
+ struct timespec ts;
+
+ vfs_timestamp(&ts);
+
+ if (data->time_gran > 1)
+ ts.tv_nsec = rounddown(ts.tv_nsec, data->time_gran);
+
+ if (flags & FN_MTIMECHANGE)
+ fvdat->cached_attrs.va_mtime = ts;
+ if (flags & FN_CTIMECHANGE)
+ fvdat->cached_attrs.va_ctime = ts;
+
+ fvdat->flag |= flags;
+}
+
+void
+fuse_node_init(void)
+{
+ fuse_node_count = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_node_count);
+}
+
+void
+fuse_node_destroy(void)
+{
+ counter_u64_free(fuse_node_count);
}
Index: sys/fs/fuse/fuse_param.h
===================================================================
--- sys/fs/fuse/fuse_param.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Copyright (c) 2007-2009 Google Inc. and Amit Singh
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Google Inc. nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _FUSE_PARAM_H_
-#define _FUSE_PARAM_H_
-
-/*
- * This is the prefix ("fuse" by default) of the name of a FUSE device node
- * in devfs. The suffix is the device number. "/dev/fuse0" is the first FUSE
- * device by default. If you change the prefix from the default to something
- * else, the user-space FUSE library will need to know about it too.
- */
-#define FUSE_DEVICE_BASENAME "fuse"
-
-/*
- * This is the number of /dev/fuse<n> nodes we will create. <n> goes from
- * 0 to (FUSE_NDEVICES - 1).
- */
-#define FUSE_NDEVICES 16
-
-/*
- * This is the default block size of the virtual storage devices that are
- * implicitly implemented by the FUSE kernel extension. This can be changed
- * on a per-mount basis (there's one such virtual device for each mount).
- */
-#define FUSE_DEFAULT_BLOCKSIZE 4096
-
-/*
- * This is default I/O size used while accessing the virtual storage devices.
- * This can be changed on a per-mount basis.
- */
-#define FUSE_DEFAULT_IOSIZE 4096
-
-#ifdef KERNEL
-
-/*
- * This is the soft upper limit on the number of "request tickets" FUSE's
- * user-kernel IPC layer can have for a given mount. This can be modified
- * through the fuse.* sysctl interface.
- */
-#define FUSE_DEFAULT_MAX_FREE_TICKETS 1024
-
-#define FUSE_DEFAULT_IOV_PERMANENT_BUFSIZE (1L << 19)
-#define FUSE_DEFAULT_IOV_CREDIT 16
-
-#endif
-
-#define FUSE_LINK_MAX UINT32_MAX
-
-#endif /* _FUSE_PARAM_H_ */
Index: sys/fs/fuse/fuse_vfsops.c
===================================================================
--- sys/fs/fuse/fuse_vfsops.c
+++ sys/fs/fuse/fuse_vfsops.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -81,7 +86,6 @@
#include <sys/fcntl.h>
#include "fuse.h"
-#include "fuse_param.h"
#include "fuse_node.h"
#include "fuse_ipc.h"
#include "fuse_internal.h"
@@ -89,13 +93,13 @@
#include <sys/priv.h>
#include <security/mac/mac_framework.h>
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , vfsops, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , vfsops, trace, "int", "char*");
/* This will do for privilege types for now */
#ifndef PRIV_VFS_FUSE_ALLOWOTHER
@@ -108,30 +112,28 @@
#define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER
#endif
+static vfs_fhtovp_t fuse_vfsop_fhtovp;
static vfs_mount_t fuse_vfsop_mount;
static vfs_unmount_t fuse_vfsop_unmount;
static vfs_root_t fuse_vfsop_root;
static vfs_statfs_t fuse_vfsop_statfs;
+static vfs_vget_t fuse_vfsop_vget;
struct vfsops fuse_vfsops = {
+ .vfs_fhtovp = fuse_vfsop_fhtovp,
.vfs_mount = fuse_vfsop_mount,
.vfs_unmount = fuse_vfsop_unmount,
.vfs_root = fuse_vfsop_root,
.vfs_statfs = fuse_vfsop_statfs,
+ .vfs_vget = fuse_vfsop_vget,
};
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, init_backgrounded, CTLFLAG_RD,
- SYSCTL_NULL_INT_PTR, 1, "indicate async handshake");
static int fuse_enforce_dev_perms = 0;
SYSCTL_INT(_vfs_fusefs, OID_AUTO, enforce_dev_perms, CTLFLAG_RW,
&fuse_enforce_dev_perms, 0,
"enforce fuse device permissions for secondary mounts");
-static unsigned sync_unmount = 1;
-SYSCTL_UINT(_vfs_fusefs, OID_AUTO, sync_unmount, CTLFLAG_RW,
- &sync_unmount, 0, "specify when to use synchronous unmount");
-
MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer");
static int
@@ -208,11 +210,90 @@
vfs_flagopt(opts, "__" #fnam, &__mntopts, fval); \
} while (0)
-SDT_PROBE_DEFINE1(fuse, , vfsops, mntopts, "uint64_t");
-SDT_PROBE_DEFINE4(fuse, , vfsops, mount_err, "char*", "struct fuse_data*",
+SDT_PROBE_DEFINE1(fusefs, , vfsops, mntopts, "uint64_t");
+SDT_PROBE_DEFINE4(fusefs, , vfsops, mount_err, "char*", "struct fuse_data*",
"struct mount*", "int");
static int
+fuse_vfs_remount(struct mount *mp, struct thread *td, uint64_t mntopts,
+ uint32_t max_read, int daemon_timeout)
+{
+ int err = 0;
+ struct fuse_data *data = fuse_get_mpdata(mp);
+ /* Don't allow these options to be changed */
+ const static unsigned long long cant_update_opts =
+ MNT_USER; /* Mount owner must be the user running the daemon */
+
+ FUSE_LOCK();
+
+ if ((mp->mnt_flag ^ data->mnt_flag) & cant_update_opts) {
+ err = EOPNOTSUPP;
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
+ "Can't change these mount options during remount",
+ data, mp, err);
+ goto out;
+ }
+ if (((data->dataflags ^ mntopts) & FSESS_MNTOPTS_MASK) ||
+ (data->max_read != max_read) ||
+ (data->daemon_timeout != daemon_timeout)) {
+ // TODO: allow changing options where it makes sense
+ err = EOPNOTSUPP;
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
+ "Can't change fuse mount options during remount",
+ data, mp, err);
+ goto out;
+ }
+
+ if (fdata_get_dead(data)) {
+ err = ENOTCONN;
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
+ "device is dead during mount", data, mp, err);
+ goto out;
+ }
+
+ /* Sanity + permission checks */
+ if (!data->daemoncred)
+ panic("fuse daemon found, but identity unknown");
+ if (mntopts & FSESS_DAEMON_CAN_SPY)
+ err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER);
+ if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid)
+ /* are we allowed to do the first mount? */
+ err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER);
+
+out:
+ FUSE_UNLOCK();
+ return err;
+}
+
+static int
+fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags,
+ struct vnode **vpp)
+{
+ struct fuse_fid *ffhp = (struct fuse_fid *)fhp;
+ struct fuse_vnode_data *fvdat;
+ struct vnode *nvp;
+ int error;
+
+ if (!(fuse_get_mpdata(mp)->dataflags & FSESS_EXPORT_SUPPORT))
+ return EOPNOTSUPP;
+
+ error = VFS_VGET(mp, ffhp->nid, LK_EXCLUSIVE, &nvp);
+ if (error) {
+ *vpp = NULLVP;
+ return (error);
+ }
+ fvdat = VTOFUD(nvp);
+ if (fvdat->generation != ffhp->gen) {
+ vput(nvp);
+ *vpp = NULLVP;
+ return (ESTALE);
+ }
+ *vpp = nvp;
+ vnode_create_vobject(*vpp, 0, curthread);
+ return (0);
+}
+
+static int
fuse_vfsop_mount(struct mount *mp)
{
int err;
@@ -238,13 +319,6 @@
__mntopts = 0;
td = curthread;
- if (mp->mnt_flag & MNT_UPDATE)
- return EOPNOTSUPP;
-
- MNT_ILOCK(mp);
- mp->mnt_flag |= MNT_SYNCHRONOUS;
- mp->mnt_data = NULL;
- MNT_IUNLOCK(mp);
/* Get the new options passed to mount */
opts = mp->mnt_optnew;
@@ -255,19 +329,6 @@
if (!vfs_getopts(opts, "fspath", &err))
return err;
- /* `from' contains the device name (eg. /dev/fuse0); REQUIRED */
- fspec = vfs_getopts(opts, "from", &err);
- if (!fspec)
- return err;
-
- /* `fd' contains the filedescriptor for this session; REQUIRED */
- if (vfs_scanopt(opts, "fd", "%d", &fd) != 1)
- return EINVAL;
-
- err = fuse_getdevice(fspec, td, &fdev);
- if (err != 0)
- return err;
-
/*
* With the help of underscored options the mount program
* can inform us from the flags it sets by default
@@ -275,12 +336,7 @@
FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY);
FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN);
FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS);
- FUSE_FLAGOPT(no_attrcache, FSESS_NO_ATTRCACHE);
- FUSE_FLAGOPT(no_readahed, FSESS_NO_READAHEAD);
- FUSE_FLAGOPT(no_datacache, FSESS_NO_DATACACHE);
- FUSE_FLAGOPT(no_namecache, FSESS_NO_NAMECACHE);
- FUSE_FLAGOPT(no_mmap, FSESS_NO_MMAP);
- FUSE_FLAGOPT(brokenio, FSESS_BROKENIO);
+ FUSE_FLAGOPT(intr, FSESS_INTR);
(void)vfs_scanopt(opts, "max_read=", "%u", &max_read);
if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) {
@@ -293,11 +349,29 @@
}
subtype = vfs_getopts(opts, "subtype=", &err);
- SDT_PROBE1(fuse, , vfsops, mntopts, mntopts);
+ SDT_PROBE1(fusefs, , vfsops, mntopts, mntopts);
+ if (mp->mnt_flag & MNT_UPDATE) {
+ return fuse_vfs_remount(mp, td, mntopts, max_read,
+ daemon_timeout);
+ }
+
+ /* `from' contains the device name (eg. /dev/fuse0); REQUIRED */
+ fspec = vfs_getopts(opts, "from", &err);
+ if (!fspec)
+ return err;
+
+ /* `fd' contains the filedescriptor for this session; REQUIRED */
+ if (vfs_scanopt(opts, "fd", "%d", &fd) != 1)
+ return EINVAL;
+
+ err = fuse_getdevice(fspec, td, &fdev);
+ if (err != 0)
+ return err;
+
err = fget(td, fd, &cap_read_rights, &fp);
if (err != 0) {
- SDT_PROBE2(fuse, , vfsops, trace, 1,
+ SDT_PROBE2(fusefs, , vfsops, trace, 1,
"invalid or not opened device");
goto out;
}
@@ -307,16 +381,17 @@
td->td_fpop = fptmp;
fdrop(fp, td);
FUSE_LOCK();
- if (err != 0 || data == NULL || data->mp != NULL) {
+
+ if (err != 0 || data == NULL) {
err = ENXIO;
- SDT_PROBE4(fuse, , vfsops, mount_err,
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
"invalid or not opened device", data, mp, err);
FUSE_UNLOCK();
goto out;
}
if (fdata_get_dead(data)) {
err = ENOTCONN;
- SDT_PROBE4(fuse, , vfsops, mount_err,
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
"device is dead during mount", data, mp, err);
FUSE_UNLOCK();
goto out;
@@ -338,12 +413,17 @@
data->dataflags |= mntopts;
data->max_read = max_read;
data->daemon_timeout = daemon_timeout;
+ data->mnt_flag = mp->mnt_flag & MNT_UPDATEMASK;
FUSE_UNLOCK();
vfs_getnewfsid(mp);
MNT_ILOCK(mp);
mp->mnt_data = data;
- mp->mnt_flag |= MNT_LOCAL;
+ /*
+ * FUSE file systems can be either local or remote, but the kernel
+ * can't tell the difference.
+ */
+ mp->mnt_flag &= ~MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_USES_BCACHE;
MNT_IUNLOCK(mp);
/* We need this here as this slot is used by getnewvnode() */
@@ -354,6 +434,7 @@
}
copystr(fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &len);
bzero(mp->mnt_stat.f_mntfromname + len, MNAMELEN - len);
+ mp->mnt_iosize_max = MAXPHYS;
/* Now handshaking with daemon */
fuse_internal_send_init(data, td);
@@ -366,9 +447,10 @@
* Destroy device only if we acquired reference to
* it
*/
- SDT_PROBE4(fuse, , vfsops, mount_err,
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
"mount failed, destroy device", data, mp, err);
data->mp = NULL;
+ mp->mnt_data = NULL;
fdata_trydestroy(data);
}
FUSE_UNLOCK();
@@ -412,11 +494,13 @@
if (fdata_get_dead(data)) {
goto alreadydead;
}
- fdisp_init(&fdi, 0);
- fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL);
+ if (fsess_isimpl(mp, FUSE_DESTROY)) {
+ fdisp_init(&fdi, 0);
+ fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL);
- err = fdisp_wait_answ(&fdi);
- fdisp_destroy(&fdi);
+ (void)fdisp_wait_answ(&fdi);
+ fdisp_destroy(&fdi);
+ }
fdata_set_dead(data);
@@ -429,7 +513,6 @@
MNT_ILOCK(mp);
mp->mnt_data = NULL;
- mp->mnt_flag &= ~MNT_LOCAL;
MNT_IUNLOCK(mp);
dev_rel(fdev);
@@ -437,7 +520,87 @@
return 0;
}
+SDT_PROBE_DEFINE1(fusefs, , vfsops, invalidate_without_export,
+ "struct mount*");
static int
+fuse_vfsop_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
+{
+ struct fuse_data *data = fuse_get_mpdata(mp);
+ uint64_t nodeid = ino;
+ struct thread *td = curthread;
+ struct fuse_dispatcher fdi;
+ struct fuse_entry_out *feo;
+ struct fuse_vnode_data *fvdat;
+ const char dot[] = ".";
+ off_t filesize;
+ enum vtype vtyp;
+ int error;
+
+ if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) {
+ /*
+ * Unreachable unless you do something stupid, like export a
+ * nullfs mount of a fusefs file system.
+ */
+ SDT_PROBE1(fusefs, , vfsops, invalidate_without_export, mp);
+ return (EOPNOTSUPP);
+ }
+
+ error = fuse_internal_get_cached_vnode(mp, ino, flags, vpp);
+ if (error || *vpp != NULL)
+ return error;
+
+ /* Do a LOOKUP, using nodeid as the parent and "." as filename */
+ fdisp_init(&fdi, sizeof(dot));
+ fdisp_make(&fdi, FUSE_LOOKUP, mp, nodeid, td, td->td_ucred);
+ memcpy(fdi.indata, dot, sizeof(dot));
+ error = fdisp_wait_answ(&fdi);
+
+ if (error)
+ return error;
+
+ feo = (struct fuse_entry_out *)fdi.answ;
+ if (feo->nodeid == 0) {
+ /* zero nodeid means ENOENT */
+ error = ENOENT;
+ goto out;
+ }
+
+ vtyp = IFTOVT(feo->attr.mode);
+ error = fuse_vnode_get(mp, feo, nodeid, NULL, vpp, NULL, vtyp);
+ if (error)
+ goto out;
+ filesize = feo->attr.size;
+
+ /*
+ * If we are looking up a FUSE node represented by an
+ * existing cached vnode, and the true size reported by FUSE_LOOKUP
+ * doesn't match the vnode's cached size, then any cached writes beyond
+ * the file's current size are lost.
+ *
+ * We can get here:
+ * * following attribute cache expiration, or
+ * * due to a bug in the daemon.
+ */
+ fvdat = VTOFUD(*vpp);
+ if (vnode_isreg(*vpp) &&
+ filesize != fvdat->cached_attrs.va_size &&
+ fvdat->flag & FN_SIZECHANGE) {
+ printf("%s: WB cache incoherent on %s!\n", __func__,
+ vnode_mount(*vpp)->mnt_stat.f_mntonname);
+
+ fvdat->flag &= ~FN_SIZECHANGE;
+ }
+
+ fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
+ feo->attr_valid_nsec, NULL);
+ fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec,
+ &fvdat->entry_cache_timeout);
+out:
+ fdisp_destroy(&fdi);
+ return error;
+}
+
+static int
fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp)
{
struct fuse_data *data = fuse_get_mpdata(mp);
@@ -454,13 +617,13 @@
FUSE_LOCK();
MPASS(data->vroot == NULL || data->vroot == *vpp);
if (data->vroot == NULL) {
- SDT_PROBE2(fuse, , vfsops, trace, 1,
+ SDT_PROBE2(fusefs, , vfsops, trace, 1,
"new root vnode");
data->vroot = *vpp;
FUSE_UNLOCK();
vref(*vpp);
} else if (data->vroot != *vpp) {
- SDT_PROBE2(fuse, , vfsops, trace, 1,
+ SDT_PROBE2(fusefs, , vfsops, trace, 1,
"root vnode race");
FUSE_UNLOCK();
VOP_UNLOCK(*vpp, 0);
@@ -523,7 +686,7 @@
sbp->f_files = 0;
sbp->f_ffree = 0;
sbp->f_namemax = 0;
- sbp->f_bsize = FUSE_DEFAULT_BLOCKSIZE;
+ sbp->f_bsize = S_BLKSIZE;
return 0;
}
Index: sys/fs/fuse/fuse_vnops.c
===================================================================
--- sys/fs/fuse/fuse_vnops.c
+++ sys/fs/fuse/fuse_vnops.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -102,24 +107,30 @@
#include "fuse_internal.h"
#include "fuse_ipc.h"
#include "fuse_node.h"
-#include "fuse_param.h"
#include "fuse_io.h"
#include <sys/priv.h>
-SDT_PROVIDER_DECLARE(fuse);
+/* Maximum number of hardlinks to a single FUSE file */
+#define FUSE_LINK_MAX UINT32_MAX
+
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , vnops, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*");
/* vnode ops */
static vop_access_t fuse_vnop_access;
+static vop_advlock_t fuse_vnop_advlock;
+static vop_bmap_t fuse_vnop_bmap;
+static vop_close_t fuse_fifo_close;
static vop_close_t fuse_vnop_close;
static vop_create_t fuse_vnop_create;
static vop_deleteextattr_t fuse_vnop_deleteextattr;
+static vop_fdatasync_t fuse_vnop_fdatasync;
static vop_fsync_t fuse_vnop_fsync;
static vop_getattr_t fuse_vnop_getattr;
static vop_getextattr_t fuse_vnop_getextattr;
@@ -144,19 +155,44 @@
static vop_symlink_t fuse_vnop_symlink;
static vop_write_t fuse_vnop_write;
static vop_getpages_t fuse_vnop_getpages;
-static vop_putpages_t fuse_vnop_putpages;
static vop_print_t fuse_vnop_print;
+static vop_vptofh_t fuse_vnop_vptofh;
+struct vop_vector fuse_fifoops = {
+ .vop_default = &fifo_specops,
+ .vop_access = fuse_vnop_access,
+ .vop_close = fuse_fifo_close,
+ .vop_fsync = fuse_vnop_fsync,
+ .vop_getattr = fuse_vnop_getattr,
+ .vop_inactive = fuse_vnop_inactive,
+ .vop_pathconf = fuse_vnop_pathconf,
+ .vop_print = fuse_vnop_print,
+ .vop_read = VOP_PANIC,
+ .vop_reclaim = fuse_vnop_reclaim,
+ .vop_setattr = fuse_vnop_setattr,
+ .vop_write = VOP_PANIC,
+ .vop_vptofh = fuse_vnop_vptofh,
+};
+
struct vop_vector fuse_vnops = {
+ .vop_allocate = VOP_EINVAL,
.vop_default = &default_vnodeops,
.vop_access = fuse_vnop_access,
+ .vop_advlock = fuse_vnop_advlock,
+ .vop_bmap = fuse_vnop_bmap,
.vop_close = fuse_vnop_close,
.vop_create = fuse_vnop_create,
.vop_deleteextattr = fuse_vnop_deleteextattr,
.vop_fsync = fuse_vnop_fsync,
+ .vop_fdatasync = fuse_vnop_fdatasync,
.vop_getattr = fuse_vnop_getattr,
.vop_getextattr = fuse_vnop_getextattr,
.vop_inactive = fuse_vnop_inactive,
+ /*
+ * TODO: implement vop_ioctl after upgrading to protocol 7.16.
+ * FUSE_IOCTL was added in 7.11, but 32-bit compat is broken until
+ * 7.16.
+ */
.vop_link = fuse_vnop_link,
.vop_listextattr = fuse_vnop_listextattr,
.vop_lookup = fuse_vnop_lookup,
@@ -164,6 +200,12 @@
.vop_mknod = fuse_vnop_mknod,
.vop_open = fuse_vnop_open,
.vop_pathconf = fuse_vnop_pathconf,
+ /*
+ * TODO: implement vop_poll after upgrading to protocol 7.21.
+ * FUSE_POLL was added in protocol 7.11, but it's kind of broken until
+ * 7.21, which adds the ability for the client to choose which poll
+ * events it wants, and for a client to deregister a file handle
+ */
.vop_read = fuse_vnop_read,
.vop_readdir = fuse_vnop_readdir,
.vop_readlink = fuse_vnop_readlink,
@@ -177,41 +219,103 @@
.vop_symlink = fuse_vnop_symlink,
.vop_write = fuse_vnop_write,
.vop_getpages = fuse_vnop_getpages,
- .vop_putpages = fuse_vnop_putpages,
.vop_print = fuse_vnop_print,
+ .vop_vptofh = fuse_vnop_vptofh,
};
-static u_long fuse_lookup_cache_hits = 0;
+uma_zone_t fuse_pbuf_zone;
-SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_hits, CTLFLAG_RD,
- &fuse_lookup_cache_hits, 0, "number of positive cache hits in lookup");
+#define fuse_vm_page_lock(m) vm_page_lock((m));
+#define fuse_vm_page_unlock(m) vm_page_unlock((m));
+#define fuse_vm_page_lock_queues() ((void)0)
+#define fuse_vm_page_unlock_queues() ((void)0)
-static u_long fuse_lookup_cache_misses = 0;
+/* Check permission for extattr operations, much like extattr_check_cred */
+static int
+fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred,
+ struct thread *td, accmode_t accmode)
+{
+ struct mount *mp = vnode_mount(vp);
+ struct fuse_data *data = fuse_get_mpdata(mp);
-SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_misses, CTLFLAG_RD,
- &fuse_lookup_cache_misses, 0, "number of cache misses in lookup");
+ /*
+ * Kernel-invoked always succeeds.
+ */
+ if (cred == NOCRED)
+ return (0);
-int fuse_lookup_cache_enable = 1;
+ /*
+ * Do not allow privileged processes in jail to directly manipulate
+ * system attributes.
+ */
+ switch (ns) {
+ case EXTATTR_NAMESPACE_SYSTEM:
+ if (data->dataflags & FSESS_DEFAULT_PERMISSIONS) {
+ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
+ }
+ /* FALLTHROUGH */
+ case EXTATTR_NAMESPACE_USER:
+ return (fuse_internal_access(vp, accmode, td, cred));
+ default:
+ return (EPERM);
+ }
+}
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, lookup_cache_enable, CTLFLAG_RW,
- &fuse_lookup_cache_enable, 0, "if non-zero, enable lookup cache");
+/* Get a filehandle for a directory */
+static int
+fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp,
+ struct ucred *cred, pid_t pid)
+{
+ if (fuse_filehandle_get(vp, FREAD, fufhp, cred, pid) == 0)
+ return 0;
+ return fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid);
+}
-/*
- * XXX: This feature is highly experimental and can bring to instabilities,
- * needs revisiting before to be enabled by default.
- */
-static int fuse_reclaim_revoked = 0;
+/* Send FUSE_FLUSH for this vnode */
+static int
+fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag)
+{
+ struct fuse_flush_in *ffi;
+ struct fuse_filehandle *fufh;
+ struct fuse_dispatcher fdi;
+ struct thread *td = curthread;
+ struct mount *mp = vnode_mount(vp);
+ int err;
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, reclaim_revoked, CTLFLAG_RW,
- &fuse_reclaim_revoked, 0, "");
+ if (!fsess_isimpl(vnode_mount(vp), FUSE_FLUSH))
+ return 0;
-uma_zone_t fuse_pbuf_zone;
+ err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
+ if (err)
+ return err;
-#define fuse_vm_page_lock(m) vm_page_lock((m));
-#define fuse_vm_page_unlock(m) vm_page_unlock((m));
-#define fuse_vm_page_lock_queues() ((void)0)
-#define fuse_vm_page_unlock_queues() ((void)0)
+ fdisp_init(&fdi, sizeof(*ffi));
+ fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred);
+ ffi = fdi.indata;
+ ffi->fh = fufh->fh_id;
+ /*
+ * If the file has a POSIX lock then we're supposed to set lock_owner.
+ * If not, then lock_owner is undefined. So we may as well always set
+ * it.
+ */
+ ffi->lock_owner = td->td_proc->p_pid;
+ err = fdisp_wait_answ(&fdi);
+ if (err == ENOSYS) {
+ fsess_set_notimpl(mp, FUSE_FLUSH);
+ err = 0;
+ }
+ fdisp_destroy(&fdi);
+ return err;
+}
+
+/* Close wrapper for fifos. */
+static int
+fuse_fifo_close(struct vop_close_args *ap)
+{
+ return (fifo_specops.vop_close(ap));
+}
+
/*
struct vnop_access_args {
struct vnode *a_vp;
@@ -231,7 +335,6 @@
int accmode = ap->a_accmode;
struct ucred *cred = ap->a_cred;
- struct fuse_access_param facp;
struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
int err;
@@ -254,15 +357,192 @@
if (vnode_islnk(vp)) {
return 0;
}
- bzero(&facp, sizeof(facp));
- err = fuse_internal_access(vp, accmode, &facp, ap->a_td, ap->a_cred);
+ err = fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred);
return err;
}
/*
- struct vnop_close_args {
+ * struct vop_advlock_args {
+ * struct vop_generic_args a_gen;
+ * struct vnode *a_vp;
+ * void *a_id;
+ * int a_op;
+ * struct flock *a_fl;
+ * int a_flags;
+ * }
+ */
+static int
+fuse_vnop_advlock(struct vop_advlock_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct flock *fl = ap->a_fl;
+ struct thread *td = curthread;
+ struct ucred *cred = td->td_ucred;
+ pid_t pid = td->td_proc->p_pid;
+ struct fuse_filehandle *fufh;
+ struct fuse_dispatcher fdi;
+ struct fuse_lk_in *fli;
+ struct fuse_lk_out *flo;
+ enum fuse_opcode op;
+ int dataflags, err;
+ int flags = ap->a_flags;
+
+ dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
+
+ if (fuse_isdeadfs(vp)) {
+ return ENXIO;
+ }
+
+ if (!(dataflags & FSESS_POSIX_LOCKS))
+ return vop_stdadvlock(ap);
+ /* FUSE doesn't properly support flock until protocol 7.17 */
+ if (flags & F_FLOCK)
+ return vop_stdadvlock(ap);
+
+ err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid);
+ if (err)
+ return err;
+
+ fdisp_init(&fdi, sizeof(*fli));
+
+ switch(ap->a_op) {
+ case F_GETLK:
+ op = FUSE_GETLK;
+ break;
+ case F_SETLK:
+ op = FUSE_SETLK;
+ break;
+ case F_SETLKW:
+ op = FUSE_SETLKW;
+ break;
+ default:
+ return EINVAL;
+ }
+
+ fdisp_make_vp(&fdi, op, vp, td, cred);
+ fli = fdi.indata;
+ fli->fh = fufh->fh_id;
+ fli->owner = fl->l_pid;
+ fli->lk.start = fl->l_start;
+ if (fl->l_len != 0)
+ fli->lk.end = fl->l_start + fl->l_len - 1;
+ else
+ fli->lk.end = INT64_MAX;
+ fli->lk.type = fl->l_type;
+ fli->lk.pid = fl->l_pid;
+
+ err = fdisp_wait_answ(&fdi);
+ fdisp_destroy(&fdi);
+
+ if (err == 0 && op == FUSE_GETLK) {
+ flo = fdi.answ;
+ fl->l_type = flo->lk.type;
+ fl->l_pid = flo->lk.pid;
+ if (flo->lk.type != F_UNLCK) {
+ fl->l_start = flo->lk.start;
+ if (flo->lk.end == INT64_MAX)
+ fl->l_len = 0;
+ else
+ fl->l_len = flo->lk.end - flo->lk.start + 1;
+ fl->l_start = flo->lk.start;
+ }
+ }
+
+ return err;
+}
+
+/* {
struct vnode *a_vp;
+ daddr_t a_bn;
+ struct bufobj **a_bop;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+} */
+static int
+fuse_vnop_bmap(struct vop_bmap_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct bufobj **bo = ap->a_bop;
+ struct thread *td = curthread;
+ struct mount *mp;
+ struct fuse_dispatcher fdi;
+ struct fuse_bmap_in *fbi;
+ struct fuse_bmap_out *fbo;
+ struct fuse_data *data;
+ uint64_t biosize;
+ off_t filesize;
+ daddr_t lbn = ap->a_bn;
+ daddr_t *pbn = ap->a_bnp;
+ int *runp = ap->a_runp;
+ int *runb = ap->a_runb;
+ int error = 0;
+ int maxrun;
+
+ if (fuse_isdeadfs(vp)) {
+ return ENXIO;
+ }
+
+ mp = vnode_mount(vp);
+ data = fuse_get_mpdata(mp);
+ biosize = fuse_iosize(vp);
+ maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1,
+ data->max_readahead_blocks);
+
+ if (bo != NULL)
+ *bo = &vp->v_bufobj;
+
+ /*
+ * The FUSE_BMAP operation does not include the runp and runb
+ * variables, so we must guess. Report nonzero contiguous runs so
+ * cluster_read will combine adjacent reads. It's worthwhile to reduce
+ * upcalls even if we don't know the true physical layout of the file.
+ *
+ * FUSE file systems may opt out of read clustering in two ways:
+ * * mounting with -onoclusterr
+ * * Setting max_readahead <= maxbcachebuf during FUSE_INIT
+ */
+ if (runb != NULL)
+ *runb = MIN(lbn, maxrun);
+ if (runp != NULL) {
+ error = fuse_vnode_size(vp, &filesize, td->td_ucred, td);
+ if (error == 0)
+ *runp = MIN(MAX(0, filesize / biosize - lbn - 1),
+ maxrun);
+ else
+ *runp = 0;
+ }
+
+ if (fsess_isimpl(mp, FUSE_BMAP)) {
+ fdisp_init(&fdi, sizeof(*fbi));
+ fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred);
+ fbi = fdi.indata;
+ fbi->block = lbn;
+ fbi->blocksize = biosize;
+ error = fdisp_wait_answ(&fdi);
+ if (error == ENOSYS) {
+ fdisp_destroy(&fdi);
+ fsess_set_notimpl(mp, FUSE_BMAP);
+ error = 0;
+ } else {
+ fbo = fdi.answ;
+ if (error == 0 && pbn != NULL)
+ *pbn = fbo->block;
+ fdisp_destroy(&fdi);
+ return error;
+ }
+ }
+
+ /* If the daemon doesn't support BMAP, make up a sensible default */
+ if (pbn != NULL)
+ *pbn = lbn * btodb(biosize);
+ return (error);
+}
+
+/*
+ struct vop_close_args {
+ struct vnode *a_vp;
int a_fflag;
struct ucred *a_cred;
struct thread *a_td;
@@ -274,39 +554,48 @@
struct vnode *vp = ap->a_vp;
struct ucred *cred = ap->a_cred;
int fflag = ap->a_fflag;
- fufh_type_t fufh_type;
+ struct thread *td = ap->a_td;
+ pid_t pid = td->td_proc->p_pid;
+ int err = 0;
- if (fuse_isdeadfs(vp)) {
+ if (fuse_isdeadfs(vp))
return 0;
- }
- if (vnode_isdir(vp)) {
- if (fuse_filehandle_valid(vp, FUFH_RDONLY)) {
- fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred);
- }
+ if (vnode_isdir(vp))
return 0;
- }
- if (fflag & IO_NDELAY) {
+ if (fflag & IO_NDELAY)
return 0;
- }
- fufh_type = fuse_filehandle_xlate_from_fflags(fflag);
- if (!fuse_filehandle_valid(vp, fufh_type)) {
- int i;
-
- for (i = 0; i < FUFH_MAXTYPE; i++)
- if (fuse_filehandle_valid(vp, i))
- break;
- if (i == FUFH_MAXTYPE)
- panic("FUSE: fufh type %d found to be invalid in close"
- " (fflag=0x%x)\n",
- fufh_type, fflag);
- }
+ err = fuse_flush(vp, cred, pid, fflag);
+ /* TODO: close the file handle, if we're sure it's no longer used */
if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
- fuse_vnode_savesize(vp, cred);
+ fuse_vnode_savesize(vp, cred, td->td_proc->p_pid);
}
- return 0;
+ return err;
}
+static void
+fdisp_make_mknod_for_fallback(
+ struct fuse_dispatcher *fdip,
+ struct componentname *cnp,
+ struct vnode *dvp,
+ uint64_t parentnid,
+ struct thread *td,
+ struct ucred *cred,
+ mode_t mode,
+ enum fuse_opcode *op)
+{
+ struct fuse_mknod_in *fmni;
+
+ fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1);
+ *op = FUSE_MKNOD;
+ fdisp_make(fdip, *op, vnode_mount(dvp), parentnid, td, cred);
+ fmni = fdip->indata;
+ fmni->mode = mode;
+ fmni->rdev = 0;
+ memcpy((char *)fdip->indata + sizeof(*fmni), cnp->cn_nameptr,
+ cnp->cn_namelen);
+ ((char *)fdip->indata)[sizeof(*fmni) + cnp->cn_namelen] = '\0';
+}
/*
struct vnop_create_args {
struct vnode *a_dvp;
@@ -325,107 +614,169 @@
struct thread *td = cnp->cn_thread;
struct ucred *cred = cnp->cn_cred;
- struct fuse_open_in *foi;
+ struct fuse_data *data;
+ struct fuse_create_in *fci;
struct fuse_entry_out *feo;
- struct fuse_dispatcher fdi;
+ struct fuse_open_out *foo;
+ struct fuse_dispatcher fdi, fdi2;
struct fuse_dispatcher *fdip = &fdi;
+ struct fuse_dispatcher *fdip2 = NULL;
int err;
struct mount *mp = vnode_mount(dvp);
+ data = fuse_get_mpdata(mp);
uint64_t parentnid = VTOFUD(dvp)->nid;
mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode);
- uint64_t x_fh_id;
- uint32_t x_open_flags;
+ enum fuse_opcode op;
+ int flags;
- if (fuse_isdeadfs(dvp)) {
+ if (fuse_isdeadfs(dvp))
return ENXIO;
- }
+
+ /* FUSE expects sockets to be created with FUSE_MKNOD */
+ if (vap->va_type == VSOCK)
+ return fuse_internal_mknod(dvp, vpp, cnp, vap);
+
+ /*
+ * VOP_CREATE doesn't tell us the open(2) flags, so we guess. Only a
+ * writable mode makes sense, and we might as well include readability
+ * too.
+ */
+ flags = O_RDWR;
+
bzero(&fdi, sizeof(fdi));
- /* XXX: Will we ever want devices ? */
- if ((vap->va_type != VREG)) {
- printf("fuse_vnop_create: unsupported va_type %d\n",
- vap->va_type);
+ if (vap->va_type != VREG)
return (EINVAL);
- }
- fdisp_init(fdip, sizeof(*foi) + cnp->cn_namelen + 1);
- if (!fsess_isimpl(mp, FUSE_CREATE)) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
- "eh, daemon doesn't implement create?");
- return (EINVAL);
- }
- fdisp_make(fdip, FUSE_CREATE, vnode_mount(dvp), parentnid, td, cred);
+ if (!fsess_isimpl(mp, FUSE_CREATE) || vap->va_type == VSOCK) {
+ /* Fallback to FUSE_MKNOD/FUSE_OPEN */
+ fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td,
+ cred, mode, &op);
+ } else {
+ /* Use FUSE_CREATE */
+ size_t insize;
- foi = fdip->indata;
- foi->mode = mode;
- foi->flags = O_CREAT | O_RDWR;
+ op = FUSE_CREATE;
+ fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1);
+ fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred);
+ fci = fdip->indata;
+ fci->mode = mode;
+ fci->flags = O_CREAT | flags;
+ if (fuse_libabi_geq(data, 7, 12)) {
+ insize = sizeof(*fci);
+ fci->umask = td->td_proc->p_fd->fd_cmask;
+ } else {
+ insize = sizeof(struct fuse_open_in);
+ }
- memcpy((char *)fdip->indata + sizeof(*foi), cnp->cn_nameptr,
- cnp->cn_namelen);
- ((char *)fdip->indata)[sizeof(*foi) + cnp->cn_namelen] = '\0';
+ memcpy((char *)fdip->indata + insize, cnp->cn_nameptr,
+ cnp->cn_namelen);
+ ((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0';
+ }
err = fdisp_wait_answ(fdip);
if (err) {
- if (err == ENOSYS)
+ if (err == ENOSYS && op == FUSE_CREATE) {
fsess_set_notimpl(mp, FUSE_CREATE);
- goto out;
+ fdisp_destroy(fdip);
+ fdisp_make_mknod_for_fallback(fdip, cnp, dvp,
+ parentnid, td, cred, mode, &op);
+ err = fdisp_wait_answ(fdip);
+ }
+ if (err)
+ goto out;
}
feo = fdip->answ;
- if ((err = fuse_internal_checkentry(feo, VREG))) {
+ if ((err = fuse_internal_checkentry(feo, vap->va_type))) {
goto out;
}
- err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, VREG);
+
+ if (op == FUSE_CREATE) {
+ foo = (struct fuse_open_out*)(feo + 1);
+ } else {
+ /* Issue a separate FUSE_OPEN */
+ struct fuse_open_in *foi;
+
+ fdip2 = &fdi2;
+ fdisp_init(fdip2, sizeof(*foi));
+ fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td,
+ cred);
+ foi = fdip2->indata;
+ foi->flags = flags;
+ err = fdisp_wait_answ(fdip2);
+ if (err)
+ goto out;
+ foo = fdip2->answ;
+ }
+ err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type);
if (err) {
struct fuse_release_in *fri;
uint64_t nodeid = feo->nodeid;
- uint64_t fh_id = ((struct fuse_open_out *)(feo + 1))->fh;
+ uint64_t fh_id = foo->fh;
fdisp_init(fdip, sizeof(*fri));
fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred);
fri = fdip->indata;
fri->fh = fh_id;
- fri->flags = OFLAGS(mode);
+ fri->flags = flags;
fuse_insert_callback(fdip->tick, fuse_internal_forget_callback);
- fuse_insert_message(fdip->tick);
- return err;
+ fuse_insert_message(fdip->tick, false);
+ goto out;
}
ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create");
+ fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
+ feo->attr_valid_nsec, NULL);
- fdip->answ = feo + 1;
-
- x_fh_id = ((struct fuse_open_out *)(feo + 1))->fh;
- x_open_flags = ((struct fuse_open_out *)(feo + 1))->open_flags;
- fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, x_fh_id);
- fuse_vnode_open(*vpp, x_open_flags, td);
+ fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo);
+ fuse_vnode_open(*vpp, foo->open_flags, td);
+ /*
+ * Purge the parent's attribute cache because the daemon should've
+ * updated its mtime and ctime
+ */
+ fuse_vnode_clear_attr_cache(dvp);
cache_purge_negative(dvp);
out:
+ if (fdip2)
+ fdisp_destroy(fdip2);
fdisp_destroy(fdip);
return err;
}
/*
- * Our vnop_fsync roughly corresponds to the FUSE_FSYNC method. The Linux
- * version of FUSE also has a FUSE_FLUSH method.
- *
- * On Linux, fsync() synchronizes a file's complete in-core state with that
- * on disk. The call is not supposed to return until the system has completed
- * that action or until an error is detected.
- *
- * Linux also has an fdatasync() call that is similar to fsync() but is not
- * required to update the metadata such as access time and modification time.
- */
+ struct vnop_fdatasync_args {
+ struct vop_generic_args a_gen;
+ struct vnode * a_vp;
+ struct thread * a_td;
+ };
+*/
+static int
+fuse_vnop_fdatasync(struct vop_fdatasync_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct thread *td = ap->a_td;
+ int waitfor = MNT_WAIT;
+ int err = 0;
+
+ if (fuse_isdeadfs(vp)) {
+ return 0;
+ }
+ if ((err = vop_stdfdatasync_buf(ap)))
+ return err;
+
+ return fuse_internal_fsync(vp, td, waitfor, true);
+}
+
/*
struct vnop_fsync_args {
- struct vnodeop_desc *a_desc;
+ struct vop_generic_args a_gen;
struct vnode * a_vp;
- struct ucred * a_cred;
int a_waitfor;
struct thread * a_td;
};
@@ -435,31 +786,16 @@
{
struct vnode *vp = ap->a_vp;
struct thread *td = ap->a_td;
+ int waitfor = ap->a_waitfor;
+ int err = 0;
- struct fuse_filehandle *fufh;
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
-
- int type, err = 0;
-
if (fuse_isdeadfs(vp)) {
return 0;
}
if ((err = vop_stdfsync(ap)))
return err;
- if (!fsess_isimpl(vnode_mount(vp),
- (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) {
- goto out;
- }
- for (type = 0; type < FUFH_MAXTYPE; type++) {
- fufh = &(fvdat->fufh[type]);
- if (FUFH_IS_VALID(fufh)) {
- fuse_internal_fsync(vp, td, NULL, fufh);
- }
- }
-
-out:
- return 0;
+ return fuse_internal_fsync(vp, td, waitfor, false);
}
/*
@@ -477,12 +813,9 @@
struct vattr *vap = ap->a_vap;
struct ucred *cred = ap->a_cred;
struct thread *td = curthread;
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_attr_out *fao;
int err = 0;
int dataflags;
- struct fuse_dispatcher fdi;
dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
@@ -497,48 +830,14 @@
goto fake;
}
}
- fdisp_init(&fdi, 0);
- if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) {
- if ((err == ENOTCONN) && vnode_isvroot(vp)) {
- /* see comment in fuse_vfsop_statfs() */
- fdisp_destroy(&fdi);
- goto fake;
- }
- if (err == ENOENT) {
- fuse_internal_vnode_disappear(vp);
- }
- goto out;
+ err = fuse_internal_getattr(vp, vap, cred, td);
+ if (err == ENOTCONN && vnode_isvroot(vp)) {
+ /* see comment in fuse_vfsop_statfs() */
+ goto fake;
+ } else {
+ return err;
}
- fao = (struct fuse_attr_out *)fdi.answ;
- fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
- fao->attr_valid_nsec, vap);
- if (vap->va_type != vnode_vtype(vp)) {
- fuse_internal_vnode_disappear(vp);
- err = ENOENT;
- goto out;
- }
- if ((fvdat->flag & FN_SIZECHANGE) != 0)
- vap->va_size = fvdat->filesize;
-
- if (vnode_isreg(vp) && (fvdat->flag & FN_SIZECHANGE) == 0) {
- /*
- * This is for those cases when the file size changed without us
- * knowing, and we want to catch up.
- */
- off_t new_filesize = ((struct fuse_attr_out *)
- fdi.answ)->attr.size;
-
- if (fvdat->filesize != new_filesize) {
- fuse_vnode_setsize(vp, new_filesize);
- fvdat->flag &= ~FN_SIZECHANGE;
- }
- }
-
-out:
- fdisp_destroy(&fdi);
- return err;
-
fake:
bzero(vap, sizeof(*vap));
vap->va_type = vnode_vtype(vp);
@@ -559,31 +858,27 @@
struct thread *td = ap->a_td;
struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_filehandle *fufh = NULL;
+ struct fuse_filehandle *fufh, *fufh_tmp;
- int type, need_flush = 1;
+ int need_flush = 1;
- for (type = 0; type < FUFH_MAXTYPE; type++) {
- fufh = &(fvdat->fufh[type]);
- if (FUFH_IS_VALID(fufh)) {
- if (need_flush && vp->v_type == VREG) {
- if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
- fuse_vnode_savesize(vp, NULL);
- }
- if (fuse_data_cache_invalidate ||
- (fvdat->flag & FN_REVOKED) != 0)
- fuse_io_invalbuf(vp, td);
- else
- fuse_io_flushbuf(vp, MNT_WAIT, td);
- need_flush = 0;
+ LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) {
+ if (need_flush && vp->v_type == VREG) {
+ if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
+ fuse_vnode_savesize(vp, NULL, 0);
}
- fuse_filehandle_close(vp, type, td, NULL);
+ if ((fvdat->flag & FN_REVOKED) != 0)
+ fuse_io_invalbuf(vp, td);
+ else
+ fuse_io_flushbuf(vp, MNT_WAIT, td);
+ need_flush = 0;
}
+ fuse_filehandle_close(vp, fufh, td, NULL);
}
- if ((fvdat->flag & FN_REVOKED) != 0 && fuse_reclaim_revoked) {
+ if ((fvdat->flag & FN_REVOKED) != 0)
vrecycle(vp);
- }
+
return 0;
}
@@ -635,11 +930,39 @@
feo = fdi.answ;
err = fuse_internal_checkentry(feo, vnode_vtype(vp));
+ if (!err) {
+ /*
+ * Purge the parent's attribute cache because the daemon
+ * should've updated its mtime and ctime
+ */
+ fuse_vnode_clear_attr_cache(tdvp);
+ fuse_internal_cache_attrs(vp, &feo->attr, feo->attr_valid,
+ feo->attr_valid_nsec, NULL);
+ }
out:
fdisp_destroy(&fdi);
return err;
}
+struct fuse_lookup_alloc_arg {
+ struct fuse_entry_out *feo;
+ struct componentname *cnp;
+ uint64_t nid;
+ enum vtype vtyp;
+};
+
+/* Callback for vn_get_ino */
+static int
+fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+ struct fuse_lookup_alloc_arg *flaa = arg;
+
+ return fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp,
+ flaa->vtyp);
+}
+
+SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup,
+ "int", "struct timespec*", "struct timespec*");
/*
struct vnop_lookup_args {
struct vnodeop_desc *a_desc;
@@ -668,268 +991,146 @@
struct vnode *vp = NULL;
struct fuse_dispatcher fdi;
- enum fuse_opcode op;
+ bool did_lookup = false;
+ struct fuse_entry_out *feo = NULL;
+ enum vtype vtyp; /* vnode type of target */
+ off_t filesize; /* filesize of target */
uint64_t nid;
- struct fuse_access_param facp;
if (fuse_isdeadfs(dvp)) {
*vpp = NULL;
return ENXIO;
}
- if (!vnode_isdir(dvp)) {
+ if (!vnode_isdir(dvp))
return ENOTDIR;
- }
- if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) {
+
+ if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP))
return EROFS;
- }
- /*
- * We do access check prior to doing anything else only in the case
- * when we are at fs root (we'd like to say, "we are at the first
- * component", but that's not exactly the same... nevermind).
- * See further comments at further access checks.
- */
- bzero(&facp, sizeof(facp));
- if (vnode_isvroot(dvp)) { /* early permission check hack */
- if ((err = fuse_internal_access(dvp, VEXEC, &facp, td, cred))) {
- return err;
- }
- }
+ if ((err = fuse_internal_access(dvp, VEXEC, td, cred)))
+ return err;
+
if (flags & ISDOTDOT) {
+ KASSERT(VTOFUD(dvp)->flag & FN_PARENT_NID,
+ ("Looking up .. is TODO"));
nid = VTOFUD(dvp)->parent_nid;
- if (nid == 0) {
+ if (nid == 0)
return ENOENT;
- }
- fdisp_init(&fdi, 0);
- op = FUSE_GETATTR;
- goto calldaemon;
+ /* .. is obviously a directory */
+ vtyp = VDIR;
+ filesize = 0;
} else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') {
nid = VTOI(dvp);
- fdisp_init(&fdi, 0);
- op = FUSE_GETATTR;
- goto calldaemon;
- } else if (fuse_lookup_cache_enable) {
- err = cache_lookup(dvp, vpp, cnp, NULL, NULL);
- switch (err) {
+ /* . is obviously a directory */
+ vtyp = VDIR;
+ filesize = 0;
+ } else {
+ struct timespec now, timeout;
+ err = cache_lookup(dvp, vpp, cnp, &timeout, NULL);
+ getnanouptime(&now);
+ SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now);
+ switch (err) {
case -1: /* positive match */
- atomic_add_acq_long(&fuse_lookup_cache_hits, 1);
+ if (timespeccmp(&timeout, &now, >)) {
+ counter_u64_add(fuse_lookup_cache_hits, 1);
+ } else {
+ /* Cache timeout */
+ counter_u64_add(fuse_lookup_cache_misses, 1);
+ bintime_clear(
+ &VTOFUD(*vpp)->entry_cache_timeout);
+ cache_purge(*vpp);
+ if (dvp != *vpp)
+ vput(*vpp);
+ else
+ vrele(*vpp);
+ *vpp = NULL;
+ break;
+ }
return 0;
case 0: /* no match in cache */
- atomic_add_acq_long(&fuse_lookup_cache_misses, 1);
+ counter_u64_add(fuse_lookup_cache_misses, 1);
break;
case ENOENT: /* negative match */
+ getnanouptime(&now);
+ if (timespeccmp(&timeout, &now, <=)) {
+ /* Cache timeout */
+ cache_purge_negative(dvp);
+ break;
+ }
/* fall through */
default:
return err;
}
- }
- nid = VTOI(dvp);
- fdisp_init(&fdi, cnp->cn_namelen + 1);
- op = FUSE_LOOKUP;
-calldaemon:
- fdisp_make(&fdi, op, mp, nid, td, cred);
+ nid = VTOI(dvp);
+ fdisp_init(&fdi, cnp->cn_namelen + 1);
+ fdisp_make(&fdi, FUSE_LOOKUP, mp, nid, td, cred);
- if (op == FUSE_LOOKUP) {
memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen);
((char *)fdi.indata)[cnp->cn_namelen] = '\0';
- }
- lookup_err = fdisp_wait_answ(&fdi);
+ lookup_err = fdisp_wait_answ(&fdi);
+ did_lookup = true;
- if ((op == FUSE_LOOKUP) && !lookup_err) { /* lookup call succeeded */
- nid = ((struct fuse_entry_out *)fdi.answ)->nodeid;
- if (!nid) {
- /*
- * zero nodeid is the same as "not found",
- * but it's also cacheable (which we keep
- * keep on doing not as of writing this)
- */
- lookup_err = ENOENT;
- } else if (nid == FUSE_ROOT_ID) {
- lookup_err = EINVAL;
+ if (!lookup_err) {
+ /* lookup call succeeded */
+ feo = (struct fuse_entry_out *)fdi.answ;
+ nid = feo->nodeid;
+ if (nid == 0) {
+ /* zero nodeid means ENOENT and cache it */
+ struct timespec timeout;
+
+ fdi.answ_stat = ENOENT;
+ lookup_err = ENOENT;
+ if (cnp->cn_flags & MAKEENTRY) {
+ fuse_validity_2_timespec(feo, &timeout);
+ cache_enter_time(dvp, *vpp, cnp,
+ &timeout, NULL);
+ }
+ } else if (nid == FUSE_ROOT_ID) {
+ lookup_err = EINVAL;
+ }
+ vtyp = IFTOVT(feo->attr.mode);
+ filesize = feo->attr.size;
}
+ if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) {
+ fdisp_destroy(&fdi);
+ return lookup_err;
+ }
}
- if (lookup_err &&
- (!fdi.answ_stat || lookup_err != ENOENT || op != FUSE_LOOKUP)) {
- fdisp_destroy(&fdi);
- return lookup_err;
- }
/* lookup_err, if non-zero, must be ENOENT at this point */
if (lookup_err) {
+ /* Entry not found */
+ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) {
+ err = fuse_internal_access(dvp, VWRITE, td, cred);
+ if (!err) {
+ /*
+ * Set the SAVENAME flag to hold onto the
+ * pathname for use later in VOP_CREATE or
+ * VOP_RENAME.
+ */
+ cnp->cn_flags |= SAVENAME;
- if ((nameiop == CREATE || nameiop == RENAME) && islastcn
- /* && directory dvp has not been removed */ ) {
-
- if (vfs_isrdonly(mp)) {
- err = EROFS;
- goto out;
+ err = EJUSTRETURN;
}
-#if 0 /* THINK_ABOUT_THIS */
- if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) {
- goto out;
- }
-#endif
-
- /*
- * Possibly record the position of a slot in the
- * directory large enough for the new component name.
- * This can be recorded in the vnode private data for
- * dvp. Set the SAVENAME flag to hold onto the
- * pathname for use later in VOP_CREATE or VOP_RENAME.
- */
- cnp->cn_flags |= SAVENAME;
-
- err = EJUSTRETURN;
- goto out;
- }
- /* Consider inserting name into cache. */
-
- /*
- * No we can't use negative caching, as the fs
- * changes are out of our control.
- * False positives' falseness turns out just as things
- * go by, but false negatives' falseness doesn't.
- * (and aiding the caching mechanism with extra control
- * mechanisms comes quite close to beating the whole purpose
- * caching...)
- */
-#if 0
- if ((cnp->cn_flags & MAKEENTRY) != 0) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
- "inserting NULL into cache");
- cache_enter(dvp, NULL, cnp);
- }
-#endif
- err = ENOENT;
- goto out;
-
- } else {
-
- /* !lookup_err */
-
- struct fuse_entry_out *feo = NULL;
- struct fuse_attr *fattr = NULL;
-
- if (op == FUSE_GETATTR) {
- fattr = &((struct fuse_attr_out *)fdi.answ)->attr;
} else {
- feo = (struct fuse_entry_out *)fdi.answ;
- fattr = &(feo->attr);
+ err = ENOENT;
}
-
- /*
- * If deleting, and at end of pathname, return parameters
- * which can be used to remove file. If the wantparent flag
- * isn't set, we return only the directory, otherwise we go on
- * and lock the inode, being careful with ".".
- */
- if (nameiop == DELETE && islastcn) {
- /*
- * Check for write access on directory.
- */
- facp.xuid = fattr->uid;
- facp.facc_flags |= FACCESS_STICKY;
- err = fuse_internal_access(dvp, VWRITE, &facp, td, cred);
- facp.facc_flags &= ~FACCESS_XQUERIES;
-
- if (err) {
- goto out;
- }
- if (nid == VTOI(dvp)) {
- vref(dvp);
- *vpp = dvp;
- } else {
- err = fuse_vnode_get(dvp->v_mount, feo, nid,
- dvp, &vp, cnp, IFTOVT(fattr->mode));
- if (err)
- goto out;
- *vpp = vp;
- }
-
- /*
- * Save the name for use in VOP_RMDIR and VOP_REMOVE
- * later.
- */
- cnp->cn_flags |= SAVENAME;
- goto out;
-
- }
- /*
- * If rewriting (RENAME), return the inode and the
- * information required to rewrite the present directory
- * Must get inode of directory entry to verify it's a
- * regular file, or empty directory.
- */
- if (nameiop == RENAME && wantparent && islastcn) {
-
-#if 0 /* THINK_ABOUT_THIS */
- if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) {
- goto out;
- }
-#endif
-
- /*
- * Check for "."
- */
- if (nid == VTOI(dvp)) {
- err = EISDIR;
- goto out;
- }
- err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp,
- &vp, cnp, IFTOVT(fattr->mode));
- if (err) {
- goto out;
- }
- *vpp = vp;
- /*
- * Save the name for use in VOP_RENAME later.
- */
- cnp->cn_flags |= SAVENAME;
-
- goto out;
- }
+ } else {
+ /* Entry was found */
if (flags & ISDOTDOT) {
- struct mount *mp;
- int ltype;
+ struct fuse_lookup_alloc_arg flaa;
- /*
- * Expanded copy of vn_vget_ino() so that
- * fuse_vnode_get() can be used.
- */
- mp = dvp->v_mount;
- ltype = VOP_ISLOCKED(dvp);
- err = vfs_busy(mp, MBF_NOWAIT);
- if (err != 0) {
- vfs_ref(mp);
- VOP_UNLOCK(dvp, 0);
- err = vfs_busy(mp, 0);
- vn_lock(dvp, ltype | LK_RETRY);
- vfs_rel(mp);
- if (err)
- goto out;
- if ((dvp->v_iflag & VI_DOOMED) != 0) {
- err = ENOENT;
- vfs_unbusy(mp);
- goto out;
- }
- }
- VOP_UNLOCK(dvp, 0);
- err = fuse_vnode_get(vnode_mount(dvp), feo, nid, NULL,
- &vp, cnp, IFTOVT(fattr->mode));
- vfs_unbusy(mp);
- vn_lock(dvp, ltype | LK_RETRY);
- if ((dvp->v_iflag & VI_DOOMED) != 0) {
- if (err == 0)
- vput(vp);
- err = ENOENT;
- }
- if (err)
- goto out;
+ flaa.nid = nid;
+ flaa.feo = feo;
+ flaa.cnp = cnp;
+ flaa.vtyp = vtyp;
+ err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0,
+ &vp);
*vpp = vp;
} else if (nid == VTOI(dvp)) {
vref(dvp);
@@ -938,25 +1139,26 @@
struct fuse_vnode_data *fvdat;
err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp,
- &vp, cnp, IFTOVT(fattr->mode));
- if (err) {
+ &vp, cnp, vtyp);
+ if (err)
goto out;
- }
- fuse_vnode_setparent(vp, dvp);
+ *vpp = vp;
/*
* In the case where we are looking up a FUSE node
* represented by an existing cached vnode, and the
* true size reported by FUSE_LOOKUP doesn't match
- * the vnode's cached size, fix the vnode cache to
- * match the real object size.
+ * the vnode's cached size, then any cached writes
+ * beyond the file's current size are lost.
*
- * This can occur via FUSE distributed filesystems,
- * irregular files, etc.
+ * We can get here:
+ * * following attribute cache expiration, or
+ * * due to a bug in the daemon.
*/
fvdat = VTOFUD(vp);
if (vnode_isreg(vp) &&
- fattr->size != fvdat->filesize) {
+ filesize != fvdat->cached_attrs.va_size &&
+ fvdat->flag & FN_SIZECHANGE) {
/*
* The FN_SIZECHANGE flag reflects a dirty
* append. If userspace lets us know our cache
@@ -966,131 +1168,64 @@
*
* XXX: Maybe disable WB caching on this mount.
*/
- if (fvdat->flag & FN_SIZECHANGE)
- printf("%s: WB cache incoherent on "
- "%s!\n", __func__,
- vnode_mount(vp)->mnt_stat.f_mntonname);
+ printf("%s: WB cache incoherent on %s!\n",
+ __func__,
+ vnode_mount(vp)->mnt_stat.f_mntonname);
- (void)fuse_vnode_setsize(vp, fattr->size);
fvdat->flag &= ~FN_SIZECHANGE;
}
- *vpp = vp;
- }
- if (op == FUSE_GETATTR) {
- struct fuse_attr_out *fao =
- (struct fuse_attr_out*)fdi.answ;
- fuse_internal_cache_attrs(*vpp,
- &fao->attr, fao->attr_valid,
- fao->attr_valid_nsec, NULL);
- } else {
- struct fuse_entry_out *feo =
- (struct fuse_entry_out*)fdi.answ;
- fuse_internal_cache_attrs(*vpp,
- &feo->attr, feo->attr_valid,
- feo->attr_valid_nsec, NULL);
- }
+ MPASS(feo != NULL);
+ fuse_internal_cache_attrs(*vpp, &feo->attr,
+ feo->attr_valid, feo->attr_valid_nsec, NULL);
+ fuse_validity_2_bintime(feo->entry_valid,
+ feo->entry_valid_nsec,
+ &fvdat->entry_cache_timeout);
- /* Insert name into cache if appropriate. */
+ if ((nameiop == DELETE || nameiop == RENAME) &&
+ islastcn)
+ {
+ struct vattr dvattr;
- /*
- * Nooo, caching is evil. With caching, we can't avoid stale
- * information taking over the playground (cached info is not
- * just positive/negative, it does have qualitative aspects,
- * too). And a (VOP/FUSE)_GETATTR is always thrown anyway, when
- * walking down along cached path components, and that's not
- * any cheaper than FUSE_LOOKUP. This might change with
- * implementing kernel side attr caching, but... In Linux,
- * lookup results are not cached, and the daemon is bombarded
- * with FUSE_LOOKUPS on and on. This shows that by design, the
- * daemon is expected to handle frequent lookup queries
- * efficiently, do its caching in userspace, and so on.
- *
- * So just leave the name cache alone.
- */
-
- /*
- * Well, now I know, Linux caches lookups, but with a
- * timeout... So it's the same thing as attribute caching:
- * we can deal with it when implement timeouts.
- */
-#if 0
- if (cnp->cn_flags & MAKEENTRY) {
- cache_enter(dvp, *vpp, cnp);
- }
-#endif
- }
-out:
- if (!lookup_err) {
-
- /* No lookup error; need to clean up. */
-
- if (err) { /* Found inode; exit with no vnode. */
- if (op == FUSE_LOOKUP) {
- fuse_internal_forget_send(vnode_mount(dvp), td, cred,
- nid, 1);
- }
- fdisp_destroy(&fdi);
- return err;
- } else {
-#ifndef NO_EARLY_PERM_CHECK_HACK
- if (!islastcn) {
- /*
- * We have the attributes of the next item
- * *now*, and it's a fact, and we do not
- * have to do extra work for it (ie, beg the
- * daemon), and it neither depends on such
- * accidental things like attr caching. So
- * the big idea: check credentials *now*,
- * not at the beginning of the next call to
- * lookup.
- *
- * The first item of the lookup chain (fs root)
- * won't be checked then here, of course, as
- * its never "the next". But go and see that
- * the root is taken care about at the very
- * beginning of this function.
- *
- * Now, given we want to do the access check
- * this way, one might ask: so then why not
- * do the access check just after fetching
- * the inode and its attributes from the
- * daemon? Why bother with producing the
- * corresponding vnode at all if something
- * is not OK? We know what's the deal as
- * soon as we get those attrs... There is
- * one bit of info though not given us by
- * the daemon: whether his response is
- * authoritative or not... His response should
- * be ignored if something is mounted over
- * the dir in question. But that can be
- * known only by having the vnode...
+ err = fuse_internal_access(dvp, VWRITE, td,
+ cred);
+ if (err != 0)
+ goto out;
+ /*
+ * if the parent's sticky bit is set, check
+ * whether we're allowed to remove the file.
+ * Need to figure out the vnode locking to make
+ * this work.
*/
- int tmpvtype = vnode_vtype(*vpp);
-
- bzero(&facp, sizeof(facp));
- /*the early perm check hack */
- facp.facc_flags |= FACCESS_VA_VALID;
-
- if ((tmpvtype != VDIR) && (tmpvtype != VLNK)) {
- err = ENOTDIR;
+ fuse_internal_getattr(dvp, &dvattr, cred, td);
+ if ((dvattr.va_mode & S_ISTXT) &&
+ fuse_internal_access(dvp, VADMIN, td,
+ cred) &&
+ fuse_internal_access(*vpp, VADMIN, td,
+ cred)) {
+ err = EPERM;
+ goto out;
}
- if (!err && !vnode_mountedhere(*vpp)) {
- err = fuse_internal_access(*vpp, VEXEC, &facp, td, cred);
- }
- if (err) {
- if (tmpvtype == VLNK)
- SDT_PROBE2(fuse, , vnops, trace,
- 1, "weird, permission "
- "error with a symlink?");
- vput(*vpp);
- *vpp = NULL;
- }
}
-#endif
+
+ if (islastcn && (
+ (nameiop == DELETE) ||
+ (nameiop == RENAME && wantparent))) {
+ cnp->cn_flags |= SAVENAME;
+ }
+
}
}
- fdisp_destroy(&fdi);
+out:
+ if (err) {
+ if (vp != NULL && dvp != vp)
+ vput(vp);
+ else if (vp != NULL)
+ vrele(vp);
+ *vpp = NULL;
+ }
+ if (did_lookup)
+ fdisp_destroy(&fdi);
return err;
}
@@ -1117,6 +1252,7 @@
return ENXIO;
}
fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ fmdi.umask = curthread->td_proc->p_fd->fd_cmask;
return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi,
sizeof(fmdi), VDIR));
@@ -1134,12 +1270,19 @@
fuse_vnop_mknod(struct vop_mknod_args *ap)
{
- return (EINVAL);
-}
+ struct vnode *dvp = ap->a_dvp;
+ struct vnode **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ struct vattr *vap = ap->a_vap;
+ if (fuse_isdeadfs(dvp))
+ return ENXIO;
+ return fuse_internal_mknod(dvp, vpp, cnp, vap);
+}
+
/*
- struct vnop_open_args {
+ struct vop_open_args {
struct vnode *a_vp;
int a_mode;
struct ucred *a_cred;
@@ -1151,50 +1294,27 @@
fuse_vnop_open(struct vop_open_args *ap)
{
struct vnode *vp = ap->a_vp;
- int mode = ap->a_mode;
+ int a_mode = ap->a_mode;
struct thread *td = ap->a_td;
struct ucred *cred = ap->a_cred;
-
- fufh_type_t fufh_type;
+ pid_t pid = td->td_proc->p_pid;
struct fuse_vnode_data *fvdat;
- int error, isdir = 0;
- int32_t fuse_open_flags;
-
- if (fuse_isdeadfs(vp)) {
+ if (fuse_isdeadfs(vp))
return ENXIO;
- }
- if ((mode & (FREAD | FWRITE)) == 0)
+ if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO)
+ return (EOPNOTSUPP);
+ if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0)
return EINVAL;
fvdat = VTOFUD(vp);
- if (vnode_isdir(vp)) {
- isdir = 1;
- }
- fuse_open_flags = 0;
- if (isdir) {
- fufh_type = FUFH_RDONLY;
- } else {
- fufh_type = fuse_filehandle_xlate_from_fflags(mode);
- /*
- * For WRONLY opens, force DIRECT_IO. This is necessary
- * since writing a partial block through the buffer cache
- * will result in a read of the block and that read won't
- * be allowed by the WRONLY open.
- */
- if (fufh_type == FUFH_WRONLY ||
- (fvdat->flag & FN_DIRECTIO) != 0)
- fuse_open_flags = FOPEN_DIRECT_IO;
- }
-
- if (fuse_filehandle_validrw(vp, fufh_type) != FUFH_INVALID) {
- fuse_vnode_open(vp, fuse_open_flags, td);
+ if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) {
+ fuse_vnode_open(vp, 0, td);
return 0;
}
- error = fuse_filehandle_open(vp, fufh_type, NULL, td, cred);
- return error;
+ return fuse_filehandle_open(vp, a_mode, NULL, td, cred);
}
static int
@@ -1237,6 +1357,7 @@
struct uio *uio = ap->a_uio;
int ioflag = ap->a_ioflag;
struct ucred *cred = ap->a_cred;
+ pid_t pid = curthread->td_proc->p_pid;
if (fuse_isdeadfs(vp)) {
return ENXIO;
@@ -1246,7 +1367,7 @@
ioflag |= IO_DIRECT;
}
- return fuse_io_dispatch(vp, uio, ioflag, cred);
+ return fuse_io_dispatch(vp, uio, ioflag, cred, pid);
}
/*
@@ -1255,7 +1376,7 @@
struct uio *a_uio;
struct ucred *a_cred;
int *a_eofflag;
- int *ncookies;
+ int *a_ncookies;
u_long **a_cookies;
};
*/
@@ -1265,13 +1386,18 @@
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct ucred *cred = ap->a_cred;
-
struct fuse_filehandle *fufh = NULL;
struct fuse_iov cookediov;
-
int err = 0;
- int freefufh = 0;
+ u_long *cookies;
+ off_t startoff;
+ ssize_t tresid;
+ int ncookies;
+ bool closefufh = false;
+ pid_t pid = curthread->td_proc->p_pid;
+ if (ap->a_eofflag)
+ *ap->a_eofflag = 0;
if (fuse_isdeadfs(vp)) {
return ENXIO;
}
@@ -1280,26 +1406,61 @@
return EINVAL;
}
- if (!fuse_filehandle_valid(vp, FUFH_RDONLY)) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
- "calling readdir() before open()");
- err = fuse_filehandle_open(vp, FUFH_RDONLY, &fufh, NULL, cred);
- freefufh = 1;
- } else {
- err = fuse_filehandle_get(vp, FUFH_RDONLY, &fufh);
+ tresid = uio->uio_resid;
+ startoff = uio->uio_offset;
+ err = fuse_filehandle_get_dir(vp, &fufh, cred, pid);
+ if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
+ /*
+ * nfsd will do VOP_READDIR without first doing VOP_OPEN. We
+ * must implicitly open the directory here
+ */
+ err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred);
+ if (err == 0) {
+ /*
+ * When a directory is opened, it must be read from
+ * the beginning. Hopefully, the "startoff" still
+ * exists as an offset cookie for the directory.
+ * If not, it will read the entire directory without
+ * returning any entries and just return eof.
+ */
+ uio->uio_offset = 0;
+ }
+ closefufh = true;
}
- if (err) {
+ if (err)
return (err);
+ if (ap->a_ncookies != NULL) {
+ ncookies = uio->uio_resid /
+ (offsetof(struct dirent, d_name) + 4) + 1;
+ cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK);
+ *ap->a_ncookies = ncookies;
+ *ap->a_cookies = cookies;
+ } else {
+ ncookies = 0;
+ cookies = NULL;
}
#define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1)
fiov_init(&cookediov, DIRCOOKEDSIZE);
- err = fuse_internal_readdir(vp, uio, fufh, &cookediov);
+ err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov,
+ &ncookies, cookies);
fiov_teardown(&cookediov);
- if (freefufh) {
- fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred);
+ if (closefufh)
+ fuse_filehandle_close(vp, fufh, curthread, cred);
+
+ if (ap->a_ncookies != NULL) {
+ if (err == 0) {
+ *ap->a_ncookies -= ncookies;
+ } else {
+ free(*ap->a_cookies, M_TEMP);
+ *ap->a_ncookies = 0;
+ *ap->a_cookies = NULL;
+ }
}
+ if (err == 0 && tresid == uio->uio_resid)
+ *ap->a_eofflag = 1;
+
return err;
}
@@ -1356,22 +1517,16 @@
{
struct vnode *vp = ap->a_vp;
struct thread *td = ap->a_td;
-
struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_filehandle *fufh = NULL;
+ struct fuse_filehandle *fufh, *fufh_tmp;
- int type;
-
if (!fvdat) {
panic("FUSE: no vnode data during recycling");
}
- for (type = 0; type < FUFH_MAXTYPE; type++) {
- fufh = &(fvdat->fufh[type]);
- if (FUFH_IS_VALID(fufh)) {
- printf("FUSE: vnode being reclaimed but fufh (type=%d) is valid",
- type);
- fuse_filehandle_close(vp, type, td, NULL);
- }
+ LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) {
+ printf("FUSE: vnode being reclaimed with open fufh "
+ "(type=%#x)", fufh->fufh_type);
+ fuse_filehandle_close(vp, fufh, td, NULL);
}
if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) {
@@ -1409,12 +1564,9 @@
if (vnode_isdir(vp)) {
return EPERM;
}
- cache_purge(vp);
err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK);
- if (err == 0)
- fuse_internal_vnode_disappear(vp);
return err;
}
@@ -1438,7 +1590,8 @@
struct vnode *tvp = ap->a_tvp;
struct componentname *tcnp = ap->a_tcnp;
struct fuse_data *data;
-
+ bool newparent = fdvp != tdvp;
+ bool isdir = fvp->v_type == VDIR;
int err = 0;
if (fuse_isdeadfs(fdvp)) {
@@ -1446,7 +1599,7 @@
}
if (fvp->v_mount != tdvp->v_mount ||
(tvp && fvp->v_mount != tvp->v_mount)) {
- SDT_PROBE2(fuse, , vnops, trace, 1, "cross-device rename");
+ SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename");
err = EXDEV;
goto out;
}
@@ -1457,7 +1610,17 @@
* under the source directory in the file system tree.
* Linux performs this check at VFS level.
*/
+ /*
+ * If source is a directory, and it will get a new parent, user must
+ * have write permission to it, so ".." can be modified.
+ */
data = fuse_get_mpdata(vnode_mount(tdvp));
+ if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) {
+ err = fuse_internal_access(fvp, VWRITE,
+ tcnp->cn_thread, tcnp->cn_cred);
+ if (err)
+ goto out;
+ }
sx_xlock(&data->rename_lock);
err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp);
if (err == 0) {
@@ -1515,8 +1678,6 @@
}
err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR);
- if (err == 0)
- fuse_internal_vnode_disappear(vp);
return err;
}
@@ -1535,129 +1696,137 @@
struct vattr *vap = ap->a_vap;
struct ucred *cred = ap->a_cred;
struct thread *td = curthread;
+ struct mount *mp;
+ struct fuse_data *data;
+ struct vattr old_va;
+ int dataflags;
+ int err = 0, err2;
+ accmode_t accmode = 0;
+ bool checkperm;
+ bool drop_suid = false;
+ gid_t cr_gid;
- struct fuse_dispatcher fdi;
- struct fuse_setattr_in *fsai;
- struct fuse_access_param facp;
+ mp = vnode_mount(vp);
+ data = fuse_get_mpdata(mp);
+ dataflags = data->dataflags;
+ checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS;
+ if (cred->cr_ngroups > 0)
+ cr_gid = cred->cr_groups[0];
+ else
+ cr_gid = 0;
- int err = 0;
- enum vtype vtyp;
- int sizechanged = 0;
- uint64_t newsize = 0;
-
if (fuse_isdeadfs(vp)) {
return ENXIO;
}
- fdisp_init(&fdi, sizeof(*fsai));
- fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred);
- fsai = fdi.indata;
- fsai->valid = 0;
- bzero(&facp, sizeof(facp));
-
- facp.xuid = vap->va_uid;
- facp.xgid = vap->va_gid;
-
if (vap->va_uid != (uid_t)VNOVAL) {
- facp.facc_flags |= FACCESS_CHOWN;
- fsai->uid = vap->va_uid;
- fsai->valid |= FATTR_UID;
+ if (checkperm) {
+ /* Only root may change a file's owner */
+ err = priv_check_cred(cred, PRIV_VFS_CHOWN);
+ if (err) {
+ /* As a special case, allow the null chown */
+ err2 = fuse_internal_getattr(vp, &old_va, cred,
+ td);
+ if (err2)
+ return (err2);
+ if (vap->va_uid != old_va.va_uid)
+ return err;
+ else
+ accmode |= VADMIN;
+ drop_suid = true;
+ } else
+ accmode |= VADMIN;
+ } else
+ accmode |= VADMIN;
}
if (vap->va_gid != (gid_t)VNOVAL) {
- facp.facc_flags |= FACCESS_CHOWN;
- fsai->gid = vap->va_gid;
- fsai->valid |= FATTR_GID;
+ if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN))
+ drop_suid = true;
+ if (checkperm && !groupmember(vap->va_gid, cred))
+ {
+ /*
+ * Non-root users may only chgrp to one of their own
+ * groups
+ */
+ err = priv_check_cred(cred, PRIV_VFS_CHOWN);
+ if (err) {
+ /* As a special case, allow the null chgrp */
+ err2 = fuse_internal_getattr(vp, &old_va, cred,
+ td);
+ if (err2)
+ return (err2);
+ if (vap->va_gid != old_va.va_gid)
+ return err;
+ accmode |= VADMIN;
+ } else
+ accmode |= VADMIN;
+ } else
+ accmode |= VADMIN;
}
if (vap->va_size != VNOVAL) {
-
- struct fuse_filehandle *fufh = NULL;
-
- /*Truncate to a new value. */
- fsai->size = vap->va_size;
- sizechanged = 1;
- newsize = vap->va_size;
- fsai->valid |= FATTR_SIZE;
-
- fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh);
- if (fufh) {
- fsai->fh = fufh->fh_id;
- fsai->valid |= FATTR_FH;
+ switch (vp->v_type) {
+ case VDIR:
+ return (EISDIR);
+ case VLNK:
+ case VREG:
+ if (vfs_isrdonly(mp))
+ return (EROFS);
+ break;
+ default:
+ /*
+ * According to POSIX, the result is unspecified
+ * for file types other than regular files,
+ * directories and shared memory objects. We
+ * don't support shared memory objects in the file
+ * system, and have dubious support for truncating
+ * symlinks. Just ignore the request in other cases.
+ */
+ return (0);
}
+ /* Don't set accmode. Permission to trunc is checked upstack */
}
- if (vap->va_atime.tv_sec != VNOVAL) {
- fsai->atime = vap->va_atime.tv_sec;
- fsai->atimensec = vap->va_atime.tv_nsec;
- fsai->valid |= FATTR_ATIME;
+ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+ if (vap->va_vaflags & VA_UTIMES_NULL)
+ accmode |= VWRITE;
+ else
+ accmode |= VADMIN;
}
- if (vap->va_mtime.tv_sec != VNOVAL) {
- fsai->mtime = vap->va_mtime.tv_sec;
- fsai->mtimensec = vap->va_mtime.tv_nsec;
- fsai->valid |= FATTR_MTIME;
+ if (drop_suid) {
+ if (vap->va_mode != (mode_t)VNOVAL)
+ vap->va_mode &= ~(S_ISUID | S_ISGID);
+ else {
+ err = fuse_internal_getattr(vp, &old_va, cred, td);
+ if (err)
+ return (err);
+ vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID);
+ }
}
if (vap->va_mode != (mode_t)VNOVAL) {
- fsai->mode = vap->va_mode & ALLPERMS;
- fsai->valid |= FATTR_MODE;
+ /* Only root may set the sticky bit on non-directories */
+ if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT)
+ && priv_check_cred(cred, PRIV_VFS_STICKYFILE))
+ return EFTYPE;
+ if (checkperm && (vap->va_mode & S_ISGID)) {
+ err = fuse_internal_getattr(vp, &old_va, cred, td);
+ if (err)
+ return (err);
+ if (!groupmember(old_va.va_gid, cred)) {
+ err = priv_check_cred(cred, PRIV_VFS_SETGID);
+ if (err)
+ return (err);
+ }
+ }
+ accmode |= VADMIN;
}
- if (!fsai->valid) {
- goto out;
- }
- vtyp = vnode_vtype(vp);
- if (fsai->valid & FATTR_SIZE && vtyp == VDIR) {
- err = EISDIR;
- goto out;
- }
- if (vfs_isrdonly(vnode_mount(vp)) && (fsai->valid & ~FATTR_SIZE || vtyp == VREG)) {
- err = EROFS;
- goto out;
- }
- if (fsai->valid & ~FATTR_SIZE) {
- /*err = fuse_internal_access(vp, VADMIN, context, &facp); */
- /*XXX */
- err = 0;
- }
- facp.facc_flags &= ~FACCESS_XQUERIES;
+ if (vfs_isrdonly(mp))
+ return EROFS;
- if (err && !(fsai->valid & ~(FATTR_ATIME | FATTR_MTIME)) &&
- vap->va_vaflags & VA_UTIMES_NULL) {
- err = fuse_internal_access(vp, VWRITE, &facp, td, cred);
- }
+ err = fuse_internal_access(vp, accmode, td, cred);
if (err)
- goto out;
- if ((err = fdisp_wait_answ(&fdi)))
- goto out;
- vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode);
-
- if (vnode_vtype(vp) != vtyp) {
- if (vnode_vtype(vp) == VNON && vtyp != VNON) {
- SDT_PROBE2(fuse, , vnops, trace, 1, "FUSE: Dang! "
- "vnode_vtype is VNON and vtype isn't.");
- } else {
- /*
- * STALE vnode, ditch
- *
- * The vnode has changed its type "behind our back".
- * There's nothing really we can do, so let us just
- * force an internal revocation and tell the caller to
- * try again, if interested.
- */
- fuse_internal_vnode_disappear(vp);
- err = EAGAIN;
- }
- }
- if (err == 0) {
- struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ;
- fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
- fao->attr_valid_nsec, NULL);
- }
-
-out:
- fdisp_destroy(&fdi);
- if (!err && sizechanged) {
- fuse_vnode_setsize(vp, newsize);
- VTOFUD(vp)->flag &= ~FN_SIZECHANGE;
- }
- return err;
+ return err;
+ else
+ return fuse_internal_setattr(vp, vap, td, cred);
}
/*
@@ -1676,22 +1845,15 @@
bp->b_ioflags |= BIO_ERROR;
bp->b_error = ENXIO;
bufdone(bp);
- return ENXIO;
+ return 0;
}
- if (bp->b_iocmd == BIO_WRITE)
- fuse_vnode_refreshsize(vp, NOCRED);
- (void)fuse_io_strategy(vp, bp);
-
/*
- * This is a dangerous function. If returns error, that might mean a
- * panic. We prefer pretty much anything over being forced to panic
- * by a malicious daemon (a demon?). So we just return 0 anyway. You
- * should never mind this: this function has its own error
- * propagation mechanism via the argument buffer, so
- * not-that-melodramatic residents of the call chain still will be
- * able to know what to do.
+ * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags.
+ * fuse_io_strategy sets bp's error fields
*/
+ (void)fuse_io_strategy(vp, bp);
+
return 0;
}
@@ -1757,237 +1919,70 @@
struct uio *uio = ap->a_uio;
int ioflag = ap->a_ioflag;
struct ucred *cred = ap->a_cred;
+ pid_t pid = curthread->td_proc->p_pid;
if (fuse_isdeadfs(vp)) {
return ENXIO;
}
- fuse_vnode_refreshsize(vp, cred);
if (VTOFUD(vp)->flag & FN_DIRECTIO) {
ioflag |= IO_DIRECT;
}
- return fuse_io_dispatch(vp, uio, ioflag, cred);
+ return fuse_io_dispatch(vp, uio, ioflag, cred, pid);
}
-SDT_PROBE_DEFINE1(fuse, , vnops, vnop_getpages_error, "int");
-/*
- struct vnop_getpages_args {
- struct vnode *a_vp;
- vm_page_t *a_m;
- int a_count;
- int a_reqpage;
- };
-*/
-static int
-fuse_vnop_getpages(struct vop_getpages_args *ap)
+static daddr_t
+fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{
- int i, error, nextoff, size, toff, count, npages;
- struct uio uio;
- struct iovec iov;
- vm_offset_t kva;
- struct buf *bp;
- struct vnode *vp;
- struct thread *td;
- struct ucred *cred;
- vm_page_t *pages;
+ const int biosize = fuse_iosize(vp);
- vp = ap->a_vp;
- KASSERT(vp->v_object, ("objectless vp passed to getpages"));
- td = curthread; /* XXX */
- cred = curthread->td_ucred; /* XXX */
- pages = ap->a_m;
- npages = ap->a_count;
+ return (off / biosize);
+}
- if (!fsess_opt_mmap(vnode_mount(vp))) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
- "called on non-cacheable vnode??\n");
- return (VM_PAGER_ERROR);
- }
+static int
+fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn)
+{
+ off_t filesize;
+ int blksz, err;
+ const int biosize = fuse_iosize(vp);
- /*
- * If the last page is partially valid, just return it and allow
- * the pager to zero-out the blanks. Partially valid pages can
- * only occur at the file EOF.
- *
- * XXXGL: is that true for FUSE, which is a local filesystem,
- * but still somewhat disconnected from the kernel?
- */
- VM_OBJECT_WLOCK(vp->v_object);
- if (pages[npages - 1]->valid != 0 && --npages == 0)
- goto out;
- VM_OBJECT_WUNLOCK(vp->v_object);
+ err = fuse_vnode_size(vp, &filesize, NULL, NULL);
+ KASSERT(err == 0, ("vfs_bio_getpages can't handle errors here"));
+ if (err)
+ return biosize;
- /*
- * We use only the kva address for the buffer, but this is extremely
- * convenient and fast.
- */
- bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);
-
- kva = (vm_offset_t)bp->b_data;
- pmap_qenter(kva, pages, npages);
- VM_CNT_INC(v_vnodein);
- VM_CNT_ADD(v_vnodepgsin, npages);
-
- count = npages << PAGE_SHIFT;
- iov.iov_base = (caddr_t)kva;
- iov.iov_len = count;
- uio.uio_iov = &iov;
- uio.uio_iovcnt = 1;
- uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
- uio.uio_resid = count;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_rw = UIO_READ;
- uio.uio_td = td;
-
- error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred);
- pmap_qremove(kva, npages);
-
- uma_zfree(fuse_pbuf_zone, bp);
-
- if (error && (uio.uio_resid == count)) {
- SDT_PROBE1(fuse, , vnops, vnop_getpages_error, error);
- return VM_PAGER_ERROR;
+ if ((off_t)lbn * biosize >= filesize) {
+ blksz = 0;
+ } else if ((off_t)(lbn + 1) * biosize > filesize) {
+ blksz = filesize - (off_t)lbn * biosize;
+ } else {
+ blksz = biosize;
}
- /*
- * Calculate the number of bytes read and validate only that number
- * of bytes. Note that due to pending writes, size may be 0. This
- * does not mean that the remaining data is invalid!
- */
-
- size = count - uio.uio_resid;
- VM_OBJECT_WLOCK(vp->v_object);
- fuse_vm_page_lock_queues();
- for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
- vm_page_t m;
-
- nextoff = toff + PAGE_SIZE;
- m = pages[i];
-
- if (nextoff <= size) {
- /*
- * Read operation filled an entire page
- */
- m->valid = VM_PAGE_BITS_ALL;
- KASSERT(m->dirty == 0,
- ("fuse_getpages: page %p is dirty", m));
- } else if (size > toff) {
- /*
- * Read operation filled a partial page.
- */
- m->valid = 0;
- vm_page_set_valid_range(m, 0, size - toff);
- KASSERT(m->dirty == 0,
- ("fuse_getpages: page %p is dirty", m));
- } else {
- /*
- * Read operation was short. If no error occurred
- * we may have hit a zero-fill section. We simply
- * leave valid set to 0.
- */
- ;
- }
- }
- fuse_vm_page_unlock_queues();
-out:
- VM_OBJECT_WUNLOCK(vp->v_object);
- if (ap->a_rbehind)
- *ap->a_rbehind = 0;
- if (ap->a_rahead)
- *ap->a_rahead = 0;
- return (VM_PAGER_OK);
+ return (blksz);
}
/*
- struct vnop_putpages_args {
+ struct vnop_getpages_args {
struct vnode *a_vp;
vm_page_t *a_m;
int a_count;
- int a_sync;
- int *a_rtvals;
- vm_ooffset_t a_offset;
+ int a_reqpage;
};
*/
static int
-fuse_vnop_putpages(struct vop_putpages_args *ap)
+fuse_vnop_getpages(struct vop_getpages_args *ap)
{
- struct uio uio;
- struct iovec iov;
- vm_offset_t kva;
- struct buf *bp;
- int i, error, npages, count;
- off_t offset;
- int *rtvals;
- struct vnode *vp;
- struct thread *td;
- struct ucred *cred;
- vm_page_t *pages;
- vm_ooffset_t fsize;
+ struct vnode *vp = ap->a_vp;
- vp = ap->a_vp;
- KASSERT(vp->v_object, ("objectless vp passed to putpages"));
- fsize = vp->v_object->un_pager.vnp.vnp_size;
- td = curthread; /* XXX */
- cred = curthread->td_ucred; /* XXX */
- pages = ap->a_m;
- count = ap->a_count;
- rtvals = ap->a_rtvals;
- npages = btoc(count);
- offset = IDX_TO_OFF(pages[0]->pindex);
-
if (!fsess_opt_mmap(vnode_mount(vp))) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
+ SDT_PROBE2(fusefs, , vnops, trace, 1,
"called on non-cacheable vnode??\n");
+ return (VM_PAGER_ERROR);
}
- for (i = 0; i < npages; i++)
- rtvals[i] = VM_PAGER_AGAIN;
- /*
- * When putting pages, do not extend file past EOF.
- */
-
- if (offset + count > fsize) {
- count = fsize - offset;
- if (count < 0)
- count = 0;
- }
- /*
- * We use only the kva address for the buffer, but this is extremely
- * convenient and fast.
- */
- bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);
-
- kva = (vm_offset_t)bp->b_data;
- pmap_qenter(kva, pages, npages);
- VM_CNT_INC(v_vnodeout);
- VM_CNT_ADD(v_vnodepgsout, count);
-
- iov.iov_base = (caddr_t)kva;
- iov.iov_len = count;
- uio.uio_iov = &iov;
- uio.uio_iovcnt = 1;
- uio.uio_offset = offset;
- uio.uio_resid = count;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_rw = UIO_WRITE;
- uio.uio_td = td;
-
- error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred);
-
- pmap_qremove(kva, npages);
- uma_zfree(fuse_pbuf_zone, bp);
-
- if (!error) {
- int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
-
- for (i = 0; i < nwritten; i++) {
- rtvals[i] = VM_PAGER_OK;
- VM_OBJECT_WLOCK(pages[i]->object);
- vm_page_undirty(pages[i]);
- VM_OBJECT_WUNLOCK(pages[i]->object);
- }
- }
- return rtvals[0];
+ return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
+ ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz));
}
static const char extattr_namespace_separator = '.';
@@ -2023,6 +2018,13 @@
if (fuse_isdeadfs(vp))
return (ENXIO);
+ if (!fsess_isimpl(mp, FUSE_GETXATTR))
+ return EOPNOTSUPP;
+
+ err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
+ if (err)
+ return err;
+
/* Default to looking for user attributes. */
if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
@@ -2053,8 +2055,10 @@
err = fdisp_wait_answ(&fdi);
if (err != 0) {
- if (err == ENOSYS)
+ if (err == ENOSYS) {
fsess_set_notimpl(mp, FUSE_GETXATTR);
+ err = EOPNOTSUPP;
+ }
goto out;
}
@@ -2100,6 +2104,29 @@
if (fuse_isdeadfs(vp))
return (ENXIO);
+ if (!fsess_isimpl(mp, FUSE_SETXATTR))
+ return EOPNOTSUPP;
+
+ if (vfs_isrdonly(mp))
+ return EROFS;
+
+ /* Deleting xattrs must use VOP_DELETEEXTATTR instead */
+ if (ap->a_uio == NULL) {
+ /*
+ * If we got here as fallback from VOP_DELETEEXTATTR, then
+ * return EOPNOTSUPP.
+ */
+ if (!fsess_isimpl(mp, FUSE_REMOVEXATTR))
+ return (EOPNOTSUPP);
+ else
+ return (EINVAL);
+ }
+
+ err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
+ VWRITE);
+ if (err)
+ return err;
+
/* Default to looking for user attributes. */
if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
@@ -2127,11 +2154,14 @@
err = fdisp_wait_answ(&fdi);
- if (err != 0) {
- if (err == ENOSYS)
- fsess_set_notimpl(mp, FUSE_SETXATTR);
- goto out;
+ if (err == ENOSYS) {
+ fsess_set_notimpl(mp, FUSE_SETXATTR);
+ err = EOPNOTSUPP;
}
+ if (err == ERESTART) {
+ /* Can't restart after calling uiomove */
+ err = EINTR;
+ }
out:
fdisp_destroy(&fdi);
@@ -2227,6 +2257,13 @@
if (fuse_isdeadfs(vp))
return (ENXIO);
+ if (!fsess_isimpl(mp, FUSE_LISTXATTR))
+ return EOPNOTSUPP;
+
+ err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
+ if (err)
+ return err;
+
/*
* Add space for a NUL and the period separator if enabled.
* Default to looking for user attributes.
@@ -2251,8 +2288,10 @@
err = fdisp_wait_answ(&fdi);
if (err != 0) {
- if (err == ENOSYS)
+ if (err == ENOSYS) {
fsess_set_notimpl(mp, FUSE_LISTXATTR);
+ err = EOPNOTSUPP;
+ }
goto out;
}
@@ -2267,7 +2306,7 @@
/*
* Retrieve Linux / FUSE compatible list values.
*/
- fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
+ fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
list_xattr_in = fdi.indata;
list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out);
attr_str = (char *)fdi.indata + sizeof(*list_xattr_in);
@@ -2330,6 +2369,17 @@
if (fuse_isdeadfs(vp))
return (ENXIO);
+ if (!fsess_isimpl(mp, FUSE_REMOVEXATTR))
+ return EOPNOTSUPP;
+
+ if (vfs_isrdonly(mp))
+ return EROFS;
+
+ err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
+ VWRITE);
+ if (err)
+ return err;
+
/* Default to looking for user attributes. */
if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
@@ -2347,9 +2397,9 @@
ap->a_name);
err = fdisp_wait_answ(&fdi);
- if (err != 0) {
- if (err == ENOSYS)
- fsess_set_notimpl(mp, FUSE_REMOVEXATTR);
+ if (err == ENOSYS) {
+ fsess_set_notimpl(mp, FUSE_REMOVEXATTR);
+ err = EOPNOTSUPP;
}
fdisp_destroy(&fdi);
@@ -2373,3 +2423,48 @@
return 0;
}
+
+/*
+ * Get an NFS filehandle for a FUSE file.
+ *
+ * This will only work for FUSE file systems that guarantee the uniqueness of
+ * nodeid:generation, which most don't.
+ */
+/*
+vop_vptofh {
+ IN struct vnode *a_vp;
+ IN struct fid *a_fhp;
+};
+*/
+static int
+fuse_vnop_vptofh(struct vop_vptofh_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp);
+ _Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid),
+ "FUSE fid type is too big");
+ struct mount *mp = vnode_mount(vp);
+ struct fuse_data *data = fuse_get_mpdata(mp);
+ struct vattr va;
+ int err;
+
+ if (!(data->dataflags & FSESS_EXPORT_SUPPORT))
+ return EOPNOTSUPP;
+
+ err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread);
+ if (err)
+ return err;
+
+ /*ip = VTOI(ap->a_vp);*/
+ /*ufhp = (struct ufid *)ap->a_fhp;*/
+ fhp->len = sizeof(struct fuse_fid);
+ fhp->nid = fvdat->nid;
+ if (fvdat->generation <= UINT32_MAX)
+ fhp->gen = fvdat->generation;
+ else
+ return EOVERFLOW;
+ return (0);
+}
+
+
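Note: fuse_vnop_vptofh() above only packs the handle; for NFS exporting to
actually work, the mount's fhtovp routine must map a fuse_fid back to a vnode
and reject stale generations. The following is a minimal illustrative sketch of
that reverse direction, not the code from this change: fuse_nfs_nodeid_to_vnode()
is a hypothetical stand-in for whatever nodeid-to-vnode lookup the mount
provides, and only the fuse_fid fields used above (len, nid, gen) are assumed.

    /* Illustrative sketch only; fuse_nfs_nodeid_to_vnode() is hypothetical. */
    static int
    example_fhtovp(struct mount *mp, struct fid *fidp, int flags,
        struct vnode **vpp)
    {
            struct fuse_fid *ffid = (struct fuse_fid *)fidp;
            struct vnode *vp;
            int err;

            if (ffid->len != sizeof(struct fuse_fid))
                    return (EINVAL);
            /* Resolve the FUSE nodeid to an in-core vnode (hypothetical helper). */
            err = fuse_nfs_nodeid_to_vnode(mp, ffid->nid, &vp);
            if (err != 0)
                    return (err);
            /* The generation number detects nodeid reuse by the daemon. */
            if (VTOFUD(vp)->generation != ffid->gen) {
                    vput(vp);
                    return (ESTALE);
            }
            *vpp = vp;
            return (0);
    }
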
Index: sys/kern/vfs_cache.c
===================================================================
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -1964,7 +1964,7 @@
}
/*
- * Invalidate all entries to a particular vnode.
+ * Invalidate all entries from and to a particular vnode.
*/
void
cache_purge(struct vnode *vp)
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -118,6 +118,8 @@
static void vfs_knl_assert_unlocked(void *arg);
static void vnlru_return_batches(struct vfsops *mnt_op);
static void destroy_vpollinfo(struct vpollinfo *vi);
+static int v_inval_buf_range1(struct vnode *vp, struct bufobj *bo,
+ daddr_t startlbn, daddr_t endlbn);
/*
* These fences are intended for cases where some synchronization is
@@ -945,6 +947,12 @@
* desirable to reuse such vnodes. These conditions may cause the
* number of vnodes to reach some minimum value regardless of what
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
+ *
+ * @param mp Try to reclaim vnodes from this mountpoint
+ * @param reclaim_nc_src Only reclaim directories with outgoing namecache
+ * entries if this argument is true
+ * @param trigger Only reclaim vnodes with fewer than this many resident pages.
+ * @return The number of vnodes that were reclaimed.
*/
static int
vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
@@ -1954,9 +1962,8 @@
vtruncbuf(struct vnode *vp, off_t length, int blksize)
{
struct buf *bp, *nbp;
- int anyfreed;
- daddr_t trunclbn;
struct bufobj *bo;
+ daddr_t startlbn;
CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
vp, blksize, (uintmax_t)length);
@@ -1964,22 +1971,114 @@
/*
* Round up to the *next* lbn.
*/
- trunclbn = howmany(length, blksize);
+ startlbn = howmany(length, blksize);
ASSERT_VOP_LOCKED(vp, "vtruncbuf");
+
restart:
bo = &vp->v_bufobj;
BO_LOCK(bo);
+ if (v_inval_buf_range1(vp, bo, startlbn, INT64_MAX) == EAGAIN)
+ goto restart;
+
+ if (length > 0) {
+restartsync:
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno > 0)
+ continue;
+ /*
+ * Since we hold the vnode lock this should only
+ * fail if we're racing with the buf daemon.
+ */
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK) {
+ goto restart;
+ }
+ VNASSERT((bp->b_flags & B_DELWRI), vp,
+ ("buf(%p) on dirty queue without DELWRI", bp));
+
+ bremfree(bp);
+ bawrite(bp);
+ BO_LOCK(bo);
+ goto restartsync;
+ }
+ }
+
+ bufobj_wwait(bo, 0, 0);
+ BO_UNLOCK(bo);
+ vnode_pager_setsize(vp, length);
+
+ return (0);
+}
+
+/*
+ * Invalidate the cached pages of a file's buffer within the range of block
+ * numbers [startlbn, endlbn). Every buffer that overlaps that range will be
+ * invalidated. This must not result in any dirty data being lost.
+ */
+void
+v_inval_buf_range(struct vnode *vp, off_t start, off_t end, int blksize)
+{
+ struct bufobj *bo;
+ daddr_t startlbn, endlbn;
+ vm_pindex_t startp, endp;
+
+ /* Round "outwards" */
+ startlbn = start / blksize;
+ endlbn = howmany(end, blksize);
+ startp = OFF_TO_IDX(start);
+ endp = OFF_TO_IDX(end + PAGE_SIZE - 1);
+
+ ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
+
+restart:
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+
+#ifdef INVARIANTS
+ struct buf *bp, *nbp;
+
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ /*
+ * Disallow invalidating dirty data outside of the requested
+ * offsets. Assume that data within the requested offsets is
+ * being invalidated for a good reason.
+ */
+ off_t blkstart, blkend;
+
+ blkstart = bp->b_offset;
+ blkend = bp->b_offset + bp->b_bcount;
+ KASSERT(blkstart >= start && blkend <= end,
+ ("Invalidating extra dirty data!"));
+ }
+#endif
+
+ if (v_inval_buf_range1(vp, bo, startlbn, endlbn) == EAGAIN)
+ goto restart;
+
+ BO_UNLOCK(bo);
+ vn_pages_remove(vp, startp, endp);
+}
+
+/* Like v_inval_buf_range, but operates on whole buffers instead of offsets */
+static int
+v_inval_buf_range1(struct vnode *vp, struct bufobj *bo,
+ daddr_t startlbn, daddr_t endlbn)
+{
+ struct buf *bp, *nbp;
+ int anyfreed;
+
anyfreed = 1;
for (;anyfreed;) {
anyfreed = 0;
TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
- if (bp->b_lblkno < trunclbn)
+ if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK)
- goto restart;
+ return EAGAIN;
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
@@ -1993,17 +2092,17 @@
(nbp->b_vp != vp) ||
(nbp->b_flags & B_DELWRI))) {
BO_UNLOCK(bo);
- goto restart;
+ return EAGAIN;
}
}
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
- if (bp->b_lblkno < trunclbn)
+ if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK)
- goto restart;
+ return EAGAIN;
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
@@ -2016,40 +2115,11 @@
(nbp->b_vp != vp) ||
(nbp->b_flags & B_DELWRI) == 0)) {
BO_UNLOCK(bo);
- goto restart;
+ return EAGAIN;
}
}
}
-
- if (length > 0) {
-restartsync:
- TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
- if (bp->b_lblkno > 0)
- continue;
- /*
- * Since we hold the vnode lock this should only
- * fail if we're racing with the buf daemon.
- */
- if (BUF_LOCK(bp,
- LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
- BO_LOCKPTR(bo)) == ENOLCK) {
- goto restart;
- }
- VNASSERT((bp->b_flags & B_DELWRI), vp,
- ("buf(%p) on dirty queue without DELWRI", bp));
-
- bremfree(bp);
- bawrite(bp);
- BO_LOCK(bo);
- goto restartsync;
- }
- }
-
- bufobj_wwait(bo, 0, 0);
- BO_UNLOCK(bo);
- vnode_pager_setsize(vp, length);
-
- return (0);
+ return 0;
}
static void
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -659,6 +659,8 @@
void vinactive(struct vnode *, struct thread *);
int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
int vtruncbuf(struct vnode *vp, off_t length, int blksize);
+void v_inval_buf_range(struct vnode *vp, off_t start, off_t end,
+ int blksize);
void vunref(struct vnode *);
void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
int vrecycle(struct vnode *vp);
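The new v_inval_buf_range() declared above takes byte offsets, rounds them
outward to whole buffers and pages, and requires the vnode lock to be held.
A minimal usage sketch under those assumptions follows; the surrounding
function, filesystem, and biosize parameter are illustrative only, not part of
this change.

    /* Illustrative only: drop cached buffers/pages before an uncached write. */
    static int
    example_directio_write(struct vnode *vp, struct uio *uio, int biosize)
    {
            off_t start = uio->uio_offset;
            off_t end = uio->uio_offset + uio->uio_resid;

            ASSERT_VOP_LOCKED(vp, "example_directio_write");
            /*
             * Invalidate every buffer and page overlapping [start, end) so a
             * later cached read cannot return data the backing store never
             * saw.  Dirty data inside the range is discarded, so the caller
             * must be the one about to overwrite that range.
             */
            v_inval_buf_range(vp, start, end, biosize);
            /* ... issue the uncached write of [start, end) here ... */
            return (0);
    }
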
Index: tests/sys/fs/Makefile
===================================================================
--- tests/sys/fs/Makefile
+++ tests/sys/fs/Makefile
@@ -1,5 +1,7 @@
# $FreeBSD$
+.include <bsd.compiler.mk>
+
PACKAGE= tests
TESTSDIR= ${TESTSBASE}/sys/fs
@@ -7,6 +9,9 @@
TESTSRC= ${SRCTOP}/contrib/netbsd-tests/fs
#TESTS_SUBDIRS+= nullfs # XXX: needs rump
+.if ${COMPILER_FEATURES:Mc++14}
+TESTS_SUBDIRS+= fusefs
+.endif
TESTS_SUBDIRS+= tmpfs
${PACKAGE}FILES+= h_funcs.subr
