D20940.diff
Index: UPDATING
===================================================================
--- UPDATING
+++ UPDATING
@@ -31,6 +31,18 @@
disable the most expensive debugging functionality run
"ln -s 'abort:false,junk:false' /etc/malloc.conf".)
+20190627:
+ The vfs.fusefs.sync_unmount and vfs.fusefs.init_backgrounded sysctls
+ and the "-o sync_unmount" and "-o init_backgrounded" mount options have
+ been removed from mount_fusefs(8). You can safely remove them from
+ your scripts, because they had no effect.
+
+ The vfs.fusefs.fix_broken_io, vfs.fusefs.sync_resize,
+ vfs.fusefs.refresh_size, vfs.fusefs.mmap_enable,
+ vfs.fusefs.reclaim_revoked, and vfs.fusefs.data_cache_invalidate
+ sysctls have been removed. If you felt the need to set any of them to
+ a non-default value, please tell asomers@FreeBSD.org why.
+
20190620:
Entropy collection and the /dev/random device are no longer optional
components. The "device random" option has been removed.
Index: etc/mtree/BSD.tests.dist
===================================================================
--- etc/mtree/BSD.tests.dist
+++ etc/mtree/BSD.tests.dist
@@ -731,6 +731,8 @@
file
..
fs
+ fusefs
+ ..
tmpfs
..
..
Index: lib/libc/gen/getvfsbyname.c
===================================================================
--- lib/libc/gen/getvfsbyname.c
+++ lib/libc/gen/getvfsbyname.c
@@ -37,10 +37,26 @@
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <errno.h>
+#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
/*
+ * fusefs(5) file systems may have a "subtype" which gets appended to
+ * statfs(2)'s f_fstypename field on a per-mount basis. Allow getvfsbyname to
+ * match either the full "fusefs.foobar" or the more general "fusefs".
+ */
+static bool
+are_fusefs(const char *fsname, const char *vfc_name)
+{
+ const static char fusefs[] = "fusefs";
+ const static char fusefs_dot[] = "fusefs.";
+
+ return (strncmp(fsname, fusefs_dot, sizeof(fusefs_dot) - 1) == 0 &&
+ strcmp(fusefs, vfc_name) == 0);
+}
+
+/*
* Given a filesystem name, determine if it is resident in the kernel,
* and if it is resident, return its xvfsconf structure.
*/
@@ -62,7 +78,8 @@
}
cnt = buflen / sizeof(struct xvfsconf);
for (i = 0; i < cnt; i++) {
- if (strcmp(fsname, xvfsp[i].vfc_name) == 0) {
+ if (strcmp(fsname, xvfsp[i].vfc_name) == 0 ||
+ are_fusefs(fsname, xvfsp[i].vfc_name)) {
memcpy(vfcp, xvfsp + i, sizeof(struct xvfsconf));
free(xvfsp);
return (0);
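
In practice, the change above means getvfsbyname(3) now matches a subtyped fusefs mount as well as the generic "fusefs" entry. A minimal userland sketch, assuming a hypothetical mount whose subtype is "sshfs":

    #include <sys/param.h>
    #include <sys/mount.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct xvfsconf vfc;

        /* "fusefs.sshfs" is hypothetical; any "fusefs.<subtype>" string matches */
        if (getvfsbyname("fusefs.sshfs", &vfc) == 0)
            printf("resident filesystem: %s\n", vfc.vfc_name);
        else
            printf("fusefs is not resident in the kernel\n");
        return (0);
    }

The printed name is "fusefs", since the subtyped string matches the kernel's generic fusefs xvfsconf entry.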
Index: sbin/mount_fusefs/mount_fusefs.8
===================================================================
--- sbin/mount_fusefs/mount_fusefs.8
+++ sbin/mount_fusefs/mount_fusefs.8
@@ -3,6 +3,11 @@
.\" Copyright (c) 2005, 2006 Csaba Henk
.\" All rights reserved.
.\"
+.\" Copyright (c) 2019 The FreeBSD Foundation
+.\"
+.\" Portions of this documentation were written by BFF Storage Systems under
+.\" sponsorship from the FreeBSD Foundation.
+.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
@@ -29,7 +34,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd November 17, 2018
+.Dd July 18, 2019
.Dt MOUNT_FUSEFS 8
.Os
.Sh NAME
@@ -136,23 +141,38 @@
by prefixing them with
.Dq no ) :
.Bl -tag -width indent
-.It Cm default_permissions
-Enable traditional (file mode based) permission checking in kernel
.It Cm allow_other
Do not apply
.Sx STRICT ACCESS POLICY .
Only root can use this option
+.It Cm async
+I/O to the file system may be done asynchronously.
+Writes may be delayed and/or reordered.
+.It Cm default_permissions
+Enable traditional (file mode based) permission checking in kernel
+.It Cm intr
+Allow signals to interrupt operations that are blocked waiting for a reply from the server.
+When this option is in use, system calls may fail with
+.Er EINTR
+whenever a signal is received.
.It Cm max_read Ns = Ns Ar n
Limit size of read requests to
.Ar n
+.It Cm neglect_shares
+Do not refuse unmounting if there are secondary mounts
.It Cm private
Refuse shared mounting of the daemon.
This is the default behaviour, to allow sharing, expicitly use
.Fl o Cm noprivate
-.It Cm neglect_shares
-Do not refuse unmounting if there are secondary mounts
.It Cm push_symlinks_in
Prefix absolute symlinks with the mountpoint
+.It Cm subtype Ns = Ns Ar fsname
+Suffix
+.Ar fsname
+to the file system name as reported by
+.Xr statfs 2 .
+This option can be used to identify the file system implemented by
+.Ar fuse_daemon .
.El
.El
.Pp
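
With the new options, a hypothetical invocation and its visible effect might look like this (daemon and mountpoint names are made up):

    mount_fusefs /dev/fuse /mnt/ssh -o subtype=sshfs -o intr

after which statfs(2) on /mnt/ssh would report an f_fstypename of "fusefs.sshfs", and operations blocked on the daemon could be interrupted by signals.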
Index: sbin/mount_fusefs/mount_fusefs.c
===================================================================
--- sbin/mount_fusefs/mount_fusefs.c
+++ sbin/mount_fusefs/mount_fusefs.c
@@ -5,6 +5,11 @@
* Copyright (c) 2005 Csaba Henk
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -60,7 +65,6 @@
void usage(void);
void helpmsg(void);
void showversion(void);
-int init_backgrounded(void);
static struct mntopt mopts[] = {
#define ALTF_PRIVATE 0x01
@@ -73,8 +77,6 @@
{ "max_read=", 0, ALTF_MAXREAD, 1 },
#define ALTF_SUBTYPE 0x40
{ "subtype=", 0, ALTF_SUBTYPE, 1 },
- #define ALTF_SYNC_UNMOUNT 0x80
- { "sync_unmount", 0, ALTF_SYNC_UNMOUNT, 1 },
/*
* MOPT_AUTOMOUNTED, included by MOPT_STDOPTS, does not fit into
* the 'flags' argument to nmount(2). We have to abuse altflags
@@ -82,6 +84,8 @@
*/
#define ALTF_AUTOMOUNTED 0x100
{ "automounted", 0, ALTF_AUTOMOUNTED, 1 },
+ #define ALTF_INTR 0x200
+ { "intr", 0, ALTF_INTR, 1 },
/* Linux specific options, we silently ignore them */
{ "fsname=", 0, 0x00, 1 },
{ "fd=", 0, 0x00, 1 },
@@ -91,6 +95,8 @@
{ "large_read", 0, 0x00, 1 },
/* "nonempty", just the first two chars are stripped off during parsing */
{ "nempty", 0, 0x00, 1 },
+ { "async", 0, MNT_ASYNC, 0},
+ { "noasync", 1, MNT_ASYNC, 0},
MOPT_STDOPTS,
MOPT_END
};
@@ -107,7 +113,7 @@
{ 0, NULL, 0 }
};
-#define DEFAULT_MOUNT_FLAGS ALTF_PRIVATE | ALTF_SYNC_UNMOUNT
+#define DEFAULT_MOUNT_FLAGS ALTF_PRIVATE
int
main(int argc, char *argv[])
@@ -409,12 +415,6 @@
}
}
- if (fd >= 0 && ! init_backgrounded() && close(fd) < 0) {
- if (pid)
- kill(pid, SIGKILL);
- err(1, "failed to close fuse device");
- }
-
/* Prepare the options vector for nmount(). build_iovec() is declared
* in mntopts.h. */
sprintf(fdstr, "%d", fd);
@@ -471,6 +471,7 @@
" -o allow_other allow access to other users\n"
/* " -o nonempty allow mounts over non-empty file/dir\n" */
" -o default_permissions enable permission checking by kernel\n"
+ " -o intr interruptible mount\n"
/*
" -o fsname=NAME set filesystem name\n"
" -o large_read issue large read requests (2.4 only)\n"
@@ -481,7 +482,6 @@
" -o neglect_shares don't report EBUSY when unmount attempted\n"
" in presence of secondary mounts\n"
" -o push_symlinks_in prefix absolute symlinks with mountpoint\n"
- " -o sync_unmount do unmount synchronously\n"
);
exit(EX_USAGE);
}
@@ -491,18 +491,4 @@
{
puts("mount_fusefs [fuse4bsd] version: " FUSE4BSD_VERSION);
exit(EX_USAGE);
-}
-
-int
-init_backgrounded(void)
-{
- int ibg;
- size_t len;
-
- len = sizeof(ibg);
-
- if (sysctlbyname("vfs.fusefs.init_backgrounded", &ibg, &len, NULL, 0))
- return (0);
-
- return (ibg);
}
Index: share/man/man5/fusefs.5
===================================================================
--- share/man/man5/fusefs.5
+++ share/man/man5/fusefs.5
@@ -3,8 +3,8 @@
.\"
.\" Copyright (c) 2019 The FreeBSD Foundation
.\"
-.\" This software was developed by BFF Storage Systems, LLC under sponsorship
-.\" from the FreeBSD Foundation.
+.\" This documentation was written by BFF Storage Systems, LLC under
+.\" sponsorship from the FreeBSD Foundation.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
@@ -28,7 +28,7 @@
.\" SUCH DAMAGE.
.\"
.\" $FreeBSD$
-.Dd April 13, 2019
+.Dd June 27, 2019
.Dt FUSEFS 5
.Os
.Sh NAME
@@ -60,11 +60,9 @@
API is portable.
Many daemons can run on multiple operating systems with minimal modifications.
.Sh SYSCTL VARIABLES
-The following variables are available as both
+The following
.Xr sysctl 8
-variables and
-.Xr loader 8
-tunables:
+variables are available:
.Bl -tag -width indent
.It Va vfs.fusefs.kernelabi_major
Major version of the FUSE kernel ABI supported by this driver.
@@ -73,7 +71,7 @@
.It Va vfs.fusefs.data_cache_mode
Controls how
.Nm
-will cache file data.
+will cache file data for pre-7.23 file systems.
A value of 0 will disable caching entirely.
Every data access will be forwarded to the daemon.
A value of 1 will select write-through caching.
@@ -84,33 +82,25 @@
to the daemon by the page daemon.
Write-back caching is usually unsafe, especially for FUSE file systems that
require network access.
-.It Va vfs.fusefs.lookup_cache_enable
-Controls whether
-.Nm
-will cache lookup responses from the file system.
-FUSE file systems indicate whether lookup responses should be cacheable, but
-it may be useful to globally disable caching them if a file system is
-misbehaving.
+.Pp
+FUSE file systems using protocol 7.23 or later specify their cache behavior
+on a per-mountpoint basis, ignoring this sysctl.
+.It Va vfs.fusefs.stats.filehandle_count
+Current number of open FUSE file handles.
+.It Va vfs.fusefs.stats.lookup_cache_hits
+Total number of lookup cache hits.
+.It Va vfs.fusefs.stats.lookup_cache_misses
+Total number of lookup cache misses.
+.It Va vfs.fusefs.stats.node_count
+Current number of allocated FUSE vnodes.
+.It Va vfs.fusefs.stats.ticket_count
+Current number of allocated FUSE tickets, which is roughly equal to the
+number of FUSE operations currently being processed by daemons.
.\" Undocumented sysctls
.\" ====================
-.\" Counters: I intend to rename to vfs.fusefs.stats.* for clarity
-.\" vfs.fusefs.lookup_cache_{hits, misses}
-.\" vfs.fusefs.filehandle_count
-.\" vfs.fusefs.ticker_count
-.\" vfs.fusefs.node_count
-.\"
-.\" vfs.fusefs.version - useless since the driver moved in-tree
-.\" vfs.fusefs.reclaim_revoked: I don't understand it well-enough
-.\" vfs.fusefs.sync_unmount: dead code
.\" vfs.fusefs.enforce_dev_perms: I don't understand it well enough.
-.\" vfs.fusefs.init_backgrounded: dead code
.\" vfs.fusefs.iov_credit: I don't understand it well enough
.\" vfs.fusefs.iov_permanent_bufsize: I don't understand it well enough
-.\" vfs.fusefs.fix_broken_io: I don't understand it well enough
-.\" vfs.fusefs.sync_resize: useless and should be removed
-.\" vfs.fusefs.refresh_size: probably useless?
-.\" vfs.fusefs.mmap_enable: why is this optional?
-.\" vfs.fusefs.data_cache_invalidate: what is this needed for?
.Sh SEE ALSO
.Xr mount_fusefs 8
.Sh HISTORY
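
The renamed statistics nodes documented above can be read like any other sysctl. A small userland sketch that reads the filehandle counter (the node name comes from the man page text above):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t fh_count;
        size_t len = sizeof(fh_count);

        /* Current number of open FUSE file handles */
        if (sysctlbyname("vfs.fusefs.stats.filehandle_count", &fh_count,
            &len, NULL, 0) == 0)
            printf("open FUSE file handles: %ju\n", (uintmax_t)fh_count);
        return (0);
    }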
Index: share/man/man9/VOP_FSYNC.9
===================================================================
--- share/man/man9/VOP_FSYNC.9
+++ share/man/man9/VOP_FSYNC.9
@@ -4,6 +4,11 @@
.\"
.\" All rights reserved.
.\"
+.\" Copyright (c) 2019 The FreeBSD Foundation
+.\"
+.\" Portions of this documentation were written by BFF Storage Systems under
+.\" sponsorship from the FreeBSD Foundation.
+.\"
.\" This program is free software.
.\"
.\" Redistribution and use in source and binary forms, with or without
Index: share/mk/bsd.compiler.mk
===================================================================
--- share/mk/bsd.compiler.mk
+++ share/mk/bsd.compiler.mk
@@ -19,6 +19,7 @@
# COMPILER_FEATURES will contain one or more of the following, based on
# compiler support for that feature:
#
+# - c++14: supports full (or nearly full) C++14 programming environment.
# - c++11: supports full (or nearly full) C++11 programming environment.
# - retpoline: supports the retpoline speculative execution vulnerability
# mitigation.
@@ -200,6 +201,10 @@
.endif
${X_}COMPILER_FEATURES=
+.if ${${X_}COMPILER_TYPE} == "clang" || \
+ (${${X_}COMPILER_TYPE} == "gcc" && ${${X_}COMPILER_VERSION} >= 50000)
+${X_}COMPILER_FEATURES+= c++14
+.endif
.if ${${X_}COMPILER_TYPE} == "clang" || \
(${${X_}COMPILER_TYPE} == "gcc" && ${${X_}COMPILER_VERSION} >= 40800)
${X_}COMPILER_FEATURES+= c++11
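
Consumers can test the new feature string the same way the tree already tests for c++11. A sketch of a Makefile fragment, assuming the usual bsd.compiler.mk inclusion path:

    .include <bsd.compiler.mk>

    .if ${COMPILER_FEATURES:Mc++14}
    CXXFLAGS+=	-std=c++14
    .endif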
Index: sys/fs/fuse/fuse.h
===================================================================
--- sys/fs/fuse/fuse.h
+++ sys/fs/fuse/fuse.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -63,87 +68,10 @@
#define FUSE_MIN_DAEMON_TIMEOUT 0 /* s */
#define FUSE_MAX_DAEMON_TIMEOUT 600 /* s */
-#ifndef FUSE_FREEBSD_VERSION
-#define FUSE_FREEBSD_VERSION "0.4.4"
-#endif
-
-/* Mapping versions to features */
-
-#define FUSE_KERNELABI_GEQ(maj, min) \
-(FUSE_KERNEL_VERSION > (maj) || (FUSE_KERNEL_VERSION == (maj) && FUSE_KERNEL_MINOR_VERSION >= (min)))
-
-/*
- * Appearance of new FUSE operations is not always in par with version
- * numbering... At least, 7.3 is a sufficient condition for having
- * FUSE_{ACCESS,CREATE}.
- */
-#if FUSE_KERNELABI_GEQ(7, 3)
-#ifndef FUSE_HAS_ACCESS
-#define FUSE_HAS_ACCESS 1
-#endif
-#ifndef FUSE_HAS_CREATE
-#define FUSE_HAS_CREATE 1
-#endif
-#else /* FUSE_KERNELABI_GEQ(7, 3) */
-#ifndef FUSE_HAS_ACCESS
-#define FUSE_HAS_ACCESS 0
-#endif
-#ifndef FUSE_HAS_CREATE
-#define FUSE_HAS_CREATE 0
-#endif
-#endif
-
-#if FUSE_KERNELABI_GEQ(7, 7)
-#ifndef FUSE_HAS_GETLK
-#define FUSE_HAS_GETLK 1
-#endif
-#ifndef FUSE_HAS_SETLK
-#define FUSE_HAS_SETLK 1
-#endif
-#ifndef FUSE_HAS_SETLKW
-#define FUSE_HAS_SETLKW 1
-#endif
-#ifndef FUSE_HAS_INTERRUPT
-#define FUSE_HAS_INTERRUPT 1
-#endif
-#else /* FUSE_KERNELABI_GEQ(7, 7) */
-#ifndef FUSE_HAS_GETLK
-#define FUSE_HAS_GETLK 0
-#endif
-#ifndef FUSE_HAS_SETLK
-#define FUSE_HAS_SETLK 0
-#endif
-#ifndef FUSE_HAS_SETLKW
-#define FUSE_HAS_SETLKW 0
-#endif
-#ifndef FUSE_HAS_INTERRUPT
-#define FUSE_HAS_INTERRUPT 0
-#endif
-#endif
-
-#if FUSE_KERNELABI_GEQ(7, 8)
-#ifndef FUSE_HAS_FLUSH_RELEASE
-#define FUSE_HAS_FLUSH_RELEASE 1
-/*
- * "DESTROY" came in the middle of the 7.8 era,
- * so this is not completely exact...
- */
-#ifndef FUSE_HAS_DESTROY
-#define FUSE_HAS_DESTROY 1
-#endif
-#endif
-#else /* FUSE_KERNELABI_GEQ(7, 8) */
-#ifndef FUSE_HAS_FLUSH_RELEASE
-#define FUSE_HAS_FLUSH_RELEASE 0
-#ifndef FUSE_HAS_DESTROY
-#define FUSE_HAS_DESTROY 0
-#endif
-#endif
-#endif
-
/* misc */
SYSCTL_DECL(_vfs_fusefs);
+SYSCTL_DECL(_vfs_fusefs_stats);
/* Fuse locking */
Index: sys/fs/fuse/fuse_device.c
===================================================================
--- sys/fs/fuse/fuse_device.c
+++ sys/fs/fuse/fuse_device.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -81,27 +86,28 @@
#include <sys/selinfo.h>
#include "fuse.h"
+#include "fuse_internal.h"
#include "fuse_ipc.h"
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , device, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*");
static struct cdev *fuse_dev;
+static d_kqfilter_t fuse_device_filter;
static d_open_t fuse_device_open;
-static d_close_t fuse_device_close;
static d_poll_t fuse_device_poll;
static d_read_t fuse_device_read;
static d_write_t fuse_device_write;
static struct cdevsw fuse_device_cdevsw = {
+ .d_kqfilter = fuse_device_filter,
.d_open = fuse_device_open,
- .d_close = fuse_device_close,
.d_name = "fuse",
.d_poll = fuse_device_poll,
.d_read = fuse_device_read,
@@ -109,6 +115,15 @@
.d_version = D_VERSION,
};
+static int fuse_device_filt_read(struct knote *kn, long hint);
+static void fuse_device_filt_detach(struct knote *kn);
+
+struct filterops fuse_device_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = fuse_device_filt_detach,
+ .f_event = fuse_device_filt_read,
+};
+
/****************************
*
* >>> Fuse device op defs
@@ -119,11 +134,100 @@
fdata_dtor(void *arg)
{
struct fuse_data *fdata;
+ struct fuse_ticket *tick;
fdata = arg;
+ if (fdata == NULL)
+ return;
+
+ fdata_set_dead(fdata);
+
+ FUSE_LOCK();
+ fuse_lck_mtx_lock(fdata->aw_mtx);
+	/* wake up poll()ers */
+ selwakeuppri(&fdata->ks_rsel, PZERO + 1);
+ /* Don't let syscall handlers wait in vain */
+ while ((tick = fuse_aw_pop(fdata))) {
+ fuse_lck_mtx_lock(tick->tk_aw_mtx);
+ fticket_set_answered(tick);
+ tick->tk_aw_errno = ENOTCONN;
+ wakeup(tick);
+ fuse_lck_mtx_unlock(tick->tk_aw_mtx);
+ FUSE_ASSERT_AW_DONE(tick);
+ fuse_ticket_drop(tick);
+ }
+ fuse_lck_mtx_unlock(fdata->aw_mtx);
+
+ /* Cleanup unsent operations */
+ fuse_lck_mtx_lock(fdata->ms_mtx);
+ while ((tick = fuse_ms_pop(fdata))) {
+ fuse_ticket_drop(tick);
+ }
+ fuse_lck_mtx_unlock(fdata->ms_mtx);
+ FUSE_UNLOCK();
+
fdata_trydestroy(fdata);
}
+static int
+fuse_device_filter(struct cdev *dev, struct knote *kn)
+{
+ struct fuse_data *data;
+ int error;
+
+ error = devfs_get_cdevpriv((void **)&data);
+
+ /* EVFILT_WRITE is not supported; the device is always ready to write */
+ if (error == 0 && kn->kn_filter == EVFILT_READ) {
+ kn->kn_fop = &fuse_device_rfiltops;
+ kn->kn_hook = data;
+ knlist_add(&data->ks_rsel.si_note, kn, 0);
+ error = 0;
+ } else if (error == 0) {
+ error = EINVAL;
+ kn->kn_data = error;
+ }
+
+ return (error);
+}
+
+static void
+fuse_device_filt_detach(struct knote *kn)
+{
+ struct fuse_data *data;
+
+ data = (struct fuse_data*)kn->kn_hook;
+ MPASS(data != NULL);
+ knlist_remove(&data->ks_rsel.si_note, kn, 0);
+ kn->kn_hook = NULL;
+}
+
+static int
+fuse_device_filt_read(struct knote *kn, long hint)
+{
+ struct fuse_data *data;
+ int ready;
+
+ data = (struct fuse_data*)kn->kn_hook;
+ MPASS(data != NULL);
+
+ mtx_assert(&data->ms_mtx, MA_OWNED);
+ if (fdata_get_dead(data)) {
+ kn->kn_flags |= EV_EOF;
+ kn->kn_fflags = ENODEV;
+ kn->kn_data = 1;
+ ready = 1;
+ } else if (STAILQ_FIRST(&data->ms_head)) {
+ MPASS(data->ms_count >= 1);
+ kn->kn_data = data->ms_count;
+ ready = 1;
+ } else {
+ ready = 0;
+ }
+
+ return (ready);
+}
+
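With the kqueue filter in place, a FUSE daemon could block on the device with kevent(2) instead of poll(2) or select(2). A minimal userland sketch, assuming fusefd is the descriptor obtained by opening /dev/fuse:

    #include <sys/types.h>
    #include <sys/event.h>
    #include <sys/time.h>
    #include <err.h>

    static void
    wait_for_request(int fusefd)
    {
        struct kevent change, event;
        int kq;

        if ((kq = kqueue()) == -1)
            err(1, "kqueue");
        EV_SET(&change, fusefd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        /* Blocks until at least one FUSE request is queued for the daemon */
        if (kevent(kq, &change, 1, &event, 1, NULL) == -1)
            err(1, "kevent");
        /* event.data reports the number of pending messages (ms_count) */
    }

As the filter code above shows, EVFILT_WRITE is not supported because the device is always writable.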
/*
* Resources are set up on a per-open basis
*/
@@ -133,52 +237,17 @@
struct fuse_data *fdata;
int error;
- SDT_PROBE2(fuse, , device, trace, 1, "device open");
+ SDT_PROBE2(fusefs, , device, trace, 1, "device open");
fdata = fdata_alloc(dev, td->td_ucred);
error = devfs_set_cdevpriv(fdata, fdata_dtor);
if (error != 0)
fdata_trydestroy(fdata);
else
- SDT_PROBE2(fuse, , device, trace, 1, "device open success");
+ SDT_PROBE2(fusefs, , device, trace, 1, "device open success");
return (error);
}
-static int
-fuse_device_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
-{
- struct fuse_data *data;
- struct fuse_ticket *tick;
- int error;
-
- error = devfs_get_cdevpriv((void **)&data);
- if (error != 0)
- return (error);
- if (!data)
- panic("no fuse data upon fuse device close");
- fdata_set_dead(data);
-
- FUSE_LOCK();
- fuse_lck_mtx_lock(data->aw_mtx);
- /* wakup poll()ers */
- selwakeuppri(&data->ks_rsel, PZERO + 1);
- /* Don't let syscall handlers wait in vain */
- while ((tick = fuse_aw_pop(data))) {
- fuse_lck_mtx_lock(tick->tk_aw_mtx);
- fticket_set_answered(tick);
- tick->tk_aw_errno = ENOTCONN;
- wakeup(tick);
- fuse_lck_mtx_unlock(tick->tk_aw_mtx);
- FUSE_ASSERT_AW_DONE(tick);
- fuse_ticket_drop(tick);
- }
- fuse_lck_mtx_unlock(data->aw_mtx);
- FUSE_UNLOCK();
-
- SDT_PROBE2(fuse, , device, trace, 1, "device close");
- return (0);
-}
-
int
fuse_device_poll(struct cdev *dev, int events, struct thread *td)
{
@@ -219,7 +288,7 @@
int buflen[3];
int i;
- SDT_PROBE2(fuse, , device, trace, 1, "fuse device read");
+ SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read");
err = devfs_get_cdevpriv((void **)&data);
if (err != 0)
@@ -228,7 +297,7 @@
fuse_lck_mtx_lock(data->ms_mtx);
again:
if (fdata_get_dead(data)) {
- SDT_PROBE2(fuse, , device, trace, 2,
+ SDT_PROBE2(fusefs, , device, trace, 2,
"we know early on that reader should be kicked so we "
"don't wait for news");
fuse_lck_mtx_unlock(data->ms_mtx);
@@ -256,7 +325,7 @@
* -- and some other cases, too, tho not totally clear, when
* (cv_signal/wakeup_one signals the whole process ?)
*/
- SDT_PROBE2(fuse, , device, trace, 1, "no message on thread");
+ SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread");
goto again;
}
fuse_lck_mtx_unlock(data->ms_mtx);
@@ -266,9 +335,10 @@
* somebody somewhere -- eg., umount routine --
* wants this liaison finished off
*/
- SDT_PROBE2(fuse, , device, trace, 2, "reader is to be sacked");
+ SDT_PROBE2(fusefs, , device, trace, 2,
+ "reader is to be sacked");
if (tick) {
- SDT_PROBE2(fuse, , device, trace, 2, "weird -- "
+ SDT_PROBE2(fusefs, , device, trace, 2, "weird -- "
"\"kick\" is set tho there is message");
FUSE_ASSERT_MS_DONE(tick);
fuse_ticket_drop(tick);
@@ -276,7 +346,7 @@
return (ENODEV); /* This should make the daemon get off
* of us */
}
- SDT_PROBE2(fuse, , device, trace, 1,
+ SDT_PROBE2(fusefs, , device, trace, 1,
"fuse device read message successfully");
KASSERT(tick->tk_ms_bufdata || tick->tk_ms_bufsize == 0,
@@ -311,7 +381,7 @@
*/
if (uio->uio_resid < buflen[i]) {
fdata_set_dead(data);
- SDT_PROBE2(fuse, , device, trace, 2,
+ SDT_PROBE2(fusefs, , device, trace, 2,
"daemon is stupid, kick it off...");
err = ENODEV;
break;
@@ -331,23 +401,26 @@
fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio)
{
if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) {
- SDT_PROBE2(fuse, , device, trace, 1, "Format error: body size "
+ SDT_PROBE2(fusefs, , device, trace, 1,
+ "Format error: body size "
"differs from size claimed by header");
return (EINVAL);
}
- if (uio->uio_resid && ohead->error) {
- SDT_PROBE2(fuse, , device, trace, 1,
+ if (uio->uio_resid && ohead->unique != 0 && ohead->error) {
+ SDT_PROBE2(fusefs, , device, trace, 1,
"Format error: non zero error but message had a body");
return (EINVAL);
}
- /* Sanitize the linuxism of negative errnos */
- ohead->error = -(ohead->error);
return (0);
}
-SDT_PROBE_DEFINE1(fuse, , device, fuse_device_write_bumped_into_callback,
- "uint64_t");
+SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify,
+ "struct fuse_out_header*");
+SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket,
+ "uint64_t");
+SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found,
+ "struct fuse_ticket*");
/*
* fuse_device_write first reads the header sent by the daemon.
* If that's OK, looks up ticket/callback node by the unique id seen in header.
@@ -360,15 +433,17 @@
struct fuse_out_header ohead;
int err = 0;
struct fuse_data *data;
- struct fuse_ticket *tick, *x_tick;
+ struct mount *mp;
+ struct fuse_ticket *tick, *itick, *x_tick;
int found = 0;
err = devfs_get_cdevpriv((void **)&data);
if (err != 0)
return (err);
+ mp = data->mp;
if (uio->uio_resid < sizeof(struct fuse_out_header)) {
- SDT_PROBE2(fuse, , device, trace, 1,
+ SDT_PROBE2(fusefs, , device, trace, 1,
"fuse_device_write got less than a header!");
fdata_set_dead(data);
return (EINVAL);
@@ -393,15 +468,29 @@
fuse_lck_mtx_lock(data->aw_mtx);
TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link,
x_tick) {
- SDT_PROBE1(fuse, , device,
- fuse_device_write_bumped_into_callback,
- tick->tk_unique);
if (tick->tk_unique == ohead.unique) {
+ SDT_PROBE1(fusefs, , device, fuse_device_write_found,
+ tick);
found = 1;
fuse_aw_remove(tick);
break;
}
}
+ if (found && tick->irq_unique > 0) {
+ /*
+ * Discard the FUSE_INTERRUPT ticket that tried to interrupt
+ * this operation
+ */
+ TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link,
+ x_tick) {
+ if (itick->tk_unique == tick->irq_unique) {
+ fuse_aw_remove(itick);
+ fuse_ticket_drop(itick);
+ break;
+ }
+ }
+ tick->irq_unique = 0;
+ }
fuse_lck_mtx_unlock(data->aw_mtx);
if (found) {
@@ -414,13 +503,15 @@
* via ticket_drop(), so no manual mucking
* around...)
*/
- SDT_PROBE2(fuse, , device, trace, 1,
+ SDT_PROBE2(fusefs, , device, trace, 1,
"pass ticket to a callback");
+ /* Sanitize the linuxism of negative errnos */
+ ohead.error *= -1;
memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead));
err = tick->tk_aw_handler(tick, uio);
} else {
/* pretender doesn't wanna do anything with answer */
- SDT_PROBE2(fuse, , device, trace, 1,
+ SDT_PROBE2(fusefs, , device, trace, 1,
"stuff devalidated, so we drop it");
}
@@ -430,11 +521,51 @@
* because fuse_ticket_drop() will deal with refcount anyway.
*/
fuse_ticket_drop(tick);
+	} else if (ohead.unique == 0) {
+ /* unique == 0 means asynchronous notification */
+ SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead);
+ switch (ohead.error) {
+ case FUSE_NOTIFY_INVAL_ENTRY:
+ err = fuse_internal_invalidate_entry(mp, uio);
+ break;
+ case FUSE_NOTIFY_INVAL_INODE:
+ err = fuse_internal_invalidate_inode(mp, uio);
+ break;
+ case FUSE_NOTIFY_RETRIEVE:
+ case FUSE_NOTIFY_STORE:
+ /*
+ * Unimplemented. I don't know of any file systems
+ * that use them, and the protocol isn't sound anyway,
+ * since the notification messages don't include the
+ * inode's generation number. Without that, it's
+ * possible to manipulate the cache of the wrong vnode.
+ * Finally, it's not defined what this message should
+ * do for a file with dirty cache.
+ */
+ case FUSE_NOTIFY_POLL:
+ /* Unimplemented. See comments in fuse_vnops */
+ default:
+ /* Not implemented */
+ err = ENOSYS;
+ }
} else {
/* no callback at all! */
- SDT_PROBE2(fuse, , device, trace, 1,
- "erhm, no handler for this response");
- err = EINVAL;
+ SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket,
+ ohead.unique);
+ if (ohead.error == -EAGAIN) {
+ /*
+ * This was probably a response to a FUSE_INTERRUPT
+ * operation whose original operation is already
+ * complete. We can't store FUSE_INTERRUPT tickets
+ * indefinitely because their responses are optional.
+ * So we delete them when the original operation
+			 * completes. And sadly the fuse_out_header doesn't
+ * identify the opcode, so we have to guess.
+ */
+ err = 0;
+ } else {
+ err = EINVAL;
+ }
}
return (err);
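
For reference, the unique == 0 path above corresponds to the daemon writing a notification rather than a reply to /dev/fuse. A hedged sketch of the daemon side for FUSE_NOTIFY_INVAL_ENTRY; the struct layouts and the notify code come from the FUSE protocol header shipped with the daemon, and fusefd and parent_nodeid are assumed values:

    #include <sys/uio.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    /* struct fuse_out_header, struct fuse_notify_inval_entry_out, and
     * FUSE_NOTIFY_INVAL_ENTRY come from the daemon's FUSE protocol header */

    static void
    notify_inval_entry(int fusefd, uint64_t parent_nodeid, const char *name)
    {
        struct fuse_notify_inval_entry_out inval;
        struct fuse_out_header ohead;
        struct iovec iov[3];

        inval.parent = parent_nodeid;
        inval.namelen = strlen(name);
        inval.padding = 0;
        ohead.unique = 0;        /* unique == 0 marks an async notification */
        ohead.error = FUSE_NOTIFY_INVAL_ENTRY;
        ohead.len = sizeof(ohead) + sizeof(inval) + inval.namelen + 1;
        iov[0].iov_base = &ohead;
        iov[0].iov_len = sizeof(ohead);
        iov[1].iov_base = &inval;
        iov[1].iov_len = sizeof(inval);
        iov[2].iov_base = (void *)(uintptr_t)name;
        iov[2].iov_len = inval.namelen + 1;
        (void)writev(fusefd, iov, 3);
    }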
@@ -445,7 +576,7 @@
{
fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR,
- S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, "fuse");
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse");
if (fuse_dev == NULL)
return (ENOMEM);
return (0);
Index: sys/fs/fuse/fuse_file.h
===================================================================
--- sys/fs/fuse/fuse_file.h
+++ sys/fs/fuse/fuse_file.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -66,52 +71,115 @@
#include <sys/mman.h>
#include <sys/vnode.h>
+/*
+ * The fufh type is the access mode of the fuse file handle. It's the portion
+ * of the open(2) flags related to permission.
+ */
typedef enum fufh_type {
FUFH_INVALID = -1,
- FUFH_RDONLY = 0,
- FUFH_WRONLY = 1,
- FUFH_RDWR = 2,
- FUFH_MAXTYPE = 3,
+ FUFH_RDONLY = O_RDONLY,
+ FUFH_WRONLY = O_WRONLY,
+ FUFH_RDWR = O_RDWR,
+ FUFH_EXEC = O_EXEC,
} fufh_type_t;
-_Static_assert(FUFH_RDONLY == O_RDONLY, "RDONLY");
-_Static_assert(FUFH_WRONLY == O_WRONLY, "WRONLY");
-_Static_assert(FUFH_RDWR == O_RDWR, "RDWR");
+/*
+ * FUSE File Handles
+ *
+ * The FUSE protocol says that a server may assign a unique 64-bit file handle
+ * every time that a file is opened. Effectively, that's once for each file
+ * descriptor.
+ *
+ * Unfortunately, the VFS doesn't help us here. VOPs don't have a
+ * struct file* argument. fileops do, but many syscalls bypass the fileops
+ * layer and go straight to a vnode. Some, like writing from cache, can't
+ * track a file handle even in theory. The entire concept of the file handle
+ * is a product of FUSE's Linux origins; Linux lacks vnodes and almost every
+ * file system operation takes a struct file* argument.
+ *
+ * Since FreeBSD's VFS is more file descriptor-agnostic, we must store FUSE
+ * filehandles in the vnode. One option would be to only store a single file
+ * handle and never open FUSE files concurrently. That's what NetBSD does.
+ * But that violates FUSE's security model. FUSE expects the server to do all
+ * authorization (except when mounted with -o default_permissions). In order
+ * to do that, the server needs us to send FUSE_OPEN every time somebody opens
+ * a new file descriptor.
+ *
+ * Another option would be to never open FUSE files concurrently, but send a
+ * FUSE_ACCESS prior to every open after the first. That would give the server
+ * the opportunity to authorize the access. Unfortunately, the FUSE protocol
+ * makes ACCESS optional. File systems that don't implement it are assumed to
+ * authorize everything. A survey of 32 fuse file systems showed that only 14
+ * implemented access. Among the laggards were a few that really ought to be
+ * doing server-side authorization.
+ *
+ * So we do something hacky, similar to what OpenBSD, Illumos, and OSXFuse do.
+ * We store a list of file handles, one for each combination of vnode, uid,
+ * gid, pid, and access mode. When opening a file, we first check whether
+ * there's already a matching file handle. If so, we reuse it. If not, we
+ * send FUSE_OPEN and create a new file handle. That minimizes the number of
+ * open file handles while still allowing the server to authorize stuff.
+ *
+ * VOPs that need a file handle search through the list for a close match.
+ * They can't be guaranteed of finding an exact match because, for example, a
+ * process may have changed its UID since opening the file. Also, most VOPs
+ * don't know exactly what permission they need. Is O_RDWR required or is
+ * O_RDONLY good enough? So the file handle we end up using may not be exactly
+ * the one we're supposed to use with that file descriptor. But if the FUSE
+ * file system isn't too picky, it will work. (FWIW even Linux sometimes
+ * guesses the file handle, during writes from cache or most SETATTR
+ * operations).
+ *
+ * I suspect this mess is part of the reason why neither NFS nor 9P have an
+ * equivalent of FUSE file handles.
+ */
struct fuse_filehandle {
+ LIST_ENTRY(fuse_filehandle) next;
+
+ /* The filehandle returned by FUSE_OPEN */
uint64_t fh_id;
- fufh_type_t fh_type;
-};
-#define FUFH_IS_VALID(f) ((f)->fh_type != FUFH_INVALID)
+ /*
+ * flags returned by FUSE_OPEN
+ * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE
+ * Unsupported:
+ * FOPEN_NONSEEKABLE: Adding support would require a new per-file
+ * or per-vnode attribute, which would have to be checked by
+ * kern_lseek (and others) for every file system. The benefit is
+ * dubious, since I'm unaware of any file systems in ports that use
+ * this flag.
+ */
+ uint32_t fuse_open_flags;
-static inline fufh_type_t
-fuse_filehandle_xlate_from_mmap(int fflags)
-{
- if (fflags & (PROT_READ | PROT_WRITE))
- return FUFH_RDWR;
- else if (fflags & (PROT_WRITE))
- return FUFH_WRONLY;
- else if ((fflags & PROT_READ) || (fflags & PROT_EXEC))
- return FUFH_RDONLY;
- else
- return FUFH_INVALID;
-}
+ /* The access mode of the file handle */
+ fufh_type_t fufh_type;
-static inline fufh_type_t
-fuse_filehandle_xlate_from_fflags(int fflags)
-{
- if ((fflags & FREAD) && (fflags & FWRITE))
- return FUFH_RDWR;
- else if (fflags & (FWRITE))
- return FUFH_WRONLY;
- else if (fflags & (FREAD))
- return FUFH_RDONLY;
- else
- panic("FUSE: What kind of a flag is this (%x)?", fflags);
-}
+ /* Credentials used to open the file */
+ gid_t gid;
+ pid_t pid;
+ uid_t uid;
+};
+#define FUFH_IS_VALID(f) ((f)->fufh_type != FUFH_INVALID)
+
+/*
+ * Get the flags to use for FUSE_CREATE, FUSE_OPEN and FUSE_RELEASE
+ *
+ * These are supposed to be the same as the flags argument to open(2).
+ * However, since we can't reliably associate a fuse_filehandle with a specific
+ * file descriptor, it would be dangerous to include anything more than
+ * the access mode flags. For example, suppose we open a file twice, once with
+ * O_APPEND and once without. Then the user pwrite(2)s to offset 0 using the
+ * second file descriptor. If fusefs uses the first file handle, then the
+ * server may append the write to the end of the file rather than at offset 0.
+ * To prevent problems like this, we only ever send the portion of flags
+ * related to access mode.
+ *
+ * It's essential to send that portion, because FUSE uses it for server-side
+ * authorization.
+ */
static inline int
-fuse_filehandle_xlate_to_oflags(fufh_type_t type)
+fufh_type_2_fflags(fufh_type_t type)
{
int oflags = -1;
@@ -119,6 +187,7 @@
case FUFH_RDONLY:
case FUFH_WRONLY:
case FUFH_RDWR:
+ case FUFH_EXEC:
oflags = type;
break;
default:
@@ -128,19 +197,28 @@
return oflags;
}
-int fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type);
-fufh_type_t fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type);
-int fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp);
-int fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp);
+bool fuse_filehandle_validrw(struct vnode *vp, int mode,
+ struct ucred *cred, pid_t pid);
+int fuse_filehandle_get(struct vnode *vp, int fflag,
+ struct fuse_filehandle **fufhp, struct ucred *cred,
+ pid_t pid);
+int fuse_filehandle_get_anyflags(struct vnode *vp,
+ struct fuse_filehandle **fufhp, struct ucred *cred,
+ pid_t pid);
+int fuse_filehandle_getrw(struct vnode *vp, int fflag,
+ struct fuse_filehandle **fufhp, struct ucred *cred,
+ pid_t pid);
void fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp, uint64_t fh_id);
-int fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type,
+ struct fuse_filehandle **fufhp, struct thread *td,
+ struct ucred *cred, struct fuse_open_out *foo);
+int fuse_filehandle_open(struct vnode *vp, int mode,
struct fuse_filehandle **fufhp, struct thread *td,
struct ucred *cred);
-int fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type,
+int fuse_filehandle_close(struct vnode *vp, struct fuse_filehandle *fufh,
struct thread *td, struct ucred *cred);
+
+void fuse_file_init(void);
+void fuse_file_destroy(void);
#endif /* _FUSE_FILE_H_ */
Index: sys/fs/fuse/fuse_file.c
===================================================================
--- sys/fs/fuse/fuse_file.c
+++ sys/fs/fuse/fuse_file.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -59,8 +64,9 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/module.h>
#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/module.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/conf.h>
@@ -79,52 +85,61 @@
#include "fuse.h"
#include "fuse_file.h"
#include "fuse_internal.h"
+#include "fuse_io.h"
#include "fuse_ipc.h"
#include "fuse_node.h"
-SDT_PROVIDER_DECLARE(fuse);
+MALLOC_DEFINE(M_FUSE_FILEHANDLE, "fuse_filefilehandle", "FUSE file handle");
+
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , file, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , file, trace, "int", "char*");
-static int fuse_fh_count = 0;
+static counter_u64_t fuse_fh_count;
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, filehandle_count, CTLFLAG_RD,
- &fuse_fh_count, 0, "number of open FUSE filehandles");
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, filehandle_count, CTLFLAG_RD,
+ &fuse_fh_count, "number of open FUSE filehandles");
+/* Get the FUFH type for a particular access mode */
+static inline fufh_type_t
+fflags_2_fufh_type(int fflags)
+{
+ if ((fflags & FREAD) && (fflags & FWRITE))
+ return FUFH_RDWR;
+ else if (fflags & (FWRITE))
+ return FUFH_WRONLY;
+ else if (fflags & (FREAD))
+ return FUFH_RDONLY;
+ else if (fflags & (FEXEC))
+ return FUFH_EXEC;
+ else
+ panic("FUSE: What kind of a flag is this (%x)?", fflags);
+}
+
int
-fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type,
+fuse_filehandle_open(struct vnode *vp, int a_mode,
struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred)
{
struct fuse_dispatcher fdi;
struct fuse_open_in *foi;
struct fuse_open_out *foo;
+ fufh_type_t fufh_type;
int err = 0;
int oflags = 0;
int op = FUSE_OPEN;
- if (fuse_filehandle_valid(vp, fufh_type)) {
- panic("FUSE: filehandle_open called despite valid fufh (type=%d)",
- fufh_type);
- /* NOTREACHED */
- }
- /*
- * Note that this means we are effectively FILTERING OUT open() flags.
- */
- oflags = fuse_filehandle_xlate_to_oflags(fufh_type);
+ fufh_type = fflags_2_fufh_type(a_mode);
+ oflags = fufh_type_2_fflags(fufh_type);
if (vnode_isdir(vp)) {
op = FUSE_OPENDIR;
- if (fufh_type != FUFH_RDONLY) {
- SDT_PROBE2(fuse, , file, trace, 1,
- "non-rdonly fh requested for a directory?");
- printf("FUSE:non-rdonly fh requested for a directory?\n");
- fufh_type = FUFH_RDONLY;
- }
+ /* vn_open_vnode already rejects FWRITE on directories */
+ MPASS(fufh_type == FUFH_RDONLY || fufh_type == FUFH_EXEC);
}
fdisp_init(&fdi, sizeof(*foi));
fdisp_make_vp(&fdi, op, vp, td, cred);
@@ -133,7 +148,7 @@
foi->flags = oflags;
if ((err = fdisp_wait_answ(&fdi))) {
- SDT_PROBE2(fuse, , file, trace, 1,
+ SDT_PROBE2(fusefs, , file, trace, 1,
"OUCH ... daemon didn't give fh");
if (err == ENOENT) {
fuse_internal_vnode_disappear(vp);
@@ -142,42 +157,24 @@
}
foo = fdi.answ;
- fuse_filehandle_init(vp, fufh_type, fufhp, foo->fh);
+ fuse_filehandle_init(vp, fufh_type, fufhp, td, cred, foo);
+ fuse_vnode_open(vp, foo->open_flags, td);
- /*
- * For WRONLY opens, force DIRECT_IO. This is necessary
- * since writing a partial block through the buffer cache
- * will result in a read of the block and that read won't
- * be allowed by the WRONLY open.
- */
- if (fufh_type == FUFH_WRONLY)
- fuse_vnode_open(vp, foo->open_flags | FOPEN_DIRECT_IO, td);
- else
- fuse_vnode_open(vp, foo->open_flags, td);
-
out:
fdisp_destroy(&fdi);
return err;
}
int
-fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type,
+fuse_filehandle_close(struct vnode *vp, struct fuse_filehandle *fufh,
struct thread *td, struct ucred *cred)
{
struct fuse_dispatcher fdi;
struct fuse_release_in *fri;
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_filehandle *fufh = NULL;
int err = 0;
int op = FUSE_RELEASE;
- fufh = &(fvdat->fufh[fufh_type]);
- if (!FUFH_IS_VALID(fufh)) {
- panic("FUSE: filehandle_put called on invalid fufh (type=%d)",
- fufh_type);
- /* NOTREACHED */
- }
if (fuse_isdeadfs(vp)) {
goto out;
}
@@ -187,96 +184,194 @@
fdisp_make_vp(&fdi, op, vp, td, cred);
fri = fdi.indata;
fri->fh = fufh->fh_id;
- fri->flags = fuse_filehandle_xlate_to_oflags(fufh_type);
+ fri->flags = fufh_type_2_fflags(fufh->fufh_type);
+ /*
+ * If the file has a POSIX lock then we're supposed to set lock_owner.
+ * If not, then lock_owner is undefined. So we may as well always set
+ * it.
+ */
+ fri->lock_owner = td->td_proc->p_pid;
err = fdisp_wait_answ(&fdi);
fdisp_destroy(&fdi);
out:
- atomic_subtract_acq_int(&fuse_fh_count, 1);
- fufh->fh_id = (uint64_t)-1;
- fufh->fh_type = FUFH_INVALID;
+ counter_u64_add(fuse_fh_count, -1);
+ LIST_REMOVE(fufh, next);
+ free(fufh, M_FUSE_FILEHANDLE);
return err;
}
-int
-fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type)
-{
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_filehandle *fufh;
-
- fufh = &(fvdat->fufh[fufh_type]);
- return FUFH_IS_VALID(fufh);
-}
-
/*
* Check for a valid file handle, first the type requested, but if that
* isn't valid, try for FUFH_RDWR.
- * Return the FUFH type that is valid or FUFH_INVALID if there are none.
- * This is a variant of fuse_filehandle_vaild() analogous to
- * fuse_filehandle_getrw().
+ * Return true if there is any file handle with the correct credentials and
+ * a fufh type that includes the provided one.
+ * A pid of 0 means "don't care"
*/
-fufh_type_t
-fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type)
+bool
+fuse_filehandle_validrw(struct vnode *vp, int mode,
+ struct ucred *cred, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_filehandle *fufh;
+ fufh_type_t fufh_type = fflags_2_fufh_type(mode);
- fufh = &fvdat->fufh[fufh_type];
- if (FUFH_IS_VALID(fufh) != 0)
- return (fufh_type);
- fufh = &fvdat->fufh[FUFH_RDWR];
- if (FUFH_IS_VALID(fufh) != 0)
- return (FUFH_RDWR);
- return (FUFH_INVALID);
+ /*
+ * Unlike fuse_filehandle_get, we want to search for a filehandle with
+ * the exact cred, and no fallback
+ */
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->fufh_type == fufh_type &&
+ fufh->uid == cred->cr_uid &&
+ fufh->gid == cred->cr_rgid &&
+ (pid == 0 || fufh->pid == pid))
+ return true;
+ }
+
+ if (fufh_type == FUFH_EXEC)
+ return false;
+
+ /* Fallback: find a RDWR list entry with the right cred */
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->fufh_type == FUFH_RDWR &&
+ fufh->uid == cred->cr_uid &&
+ fufh->gid == cred->cr_rgid &&
+ (pid == 0 || fufh->pid == pid))
+ return true;
+ }
+
+ return false;
}
int
-fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp)
+fuse_filehandle_get(struct vnode *vp, int fflag,
+ struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_filehandle *fufh;
+ fufh_type_t fufh_type;
- fufh = &(fvdat->fufh[fufh_type]);
- if (!FUFH_IS_VALID(fufh))
+ fufh_type = fflags_2_fufh_type(fflag);
+ /* cred can be NULL for in-kernel clients */
+ if (cred == NULL)
+ goto fallback;
+
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->fufh_type == fufh_type &&
+ fufh->uid == cred->cr_uid &&
+ fufh->gid == cred->cr_rgid &&
+ (pid == 0 || fufh->pid == pid))
+ goto found;
+ }
+
+fallback:
+ /* Fallback: find a list entry with the right flags */
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->fufh_type == fufh_type)
+ break;
+ }
+
+ if (fufh == NULL)
return EBADF;
+
+found:
if (fufhp != NULL)
*fufhp = fufh;
return 0;
}
+/* Get a file handle with any kind of flags */
int
-fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp)
+fuse_filehandle_get_anyflags(struct vnode *vp,
+ struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_filehandle *fufh;
- fufh = &(fvdat->fufh[fufh_type]);
- if (!FUFH_IS_VALID(fufh)) {
- fufh_type = FUFH_RDWR;
+ if (cred == NULL)
+ goto fallback;
+
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (fufh->uid == cred->cr_uid &&
+ fufh->gid == cred->cr_rgid &&
+ (pid == 0 || fufh->pid == pid))
+ goto found;
}
- return fuse_filehandle_get(vp, fufh_type, fufhp);
+
+fallback:
+ /* Fallback: find any list entry */
+ fufh = LIST_FIRST(&fvdat->handles);
+
+ if (fufh == NULL)
+ return EBADF;
+
+found:
+ if (fufhp != NULL)
+ *fufhp = fufh;
+ return 0;
}
+int
+fuse_filehandle_getrw(struct vnode *vp, int fflag,
+ struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid)
+{
+ int err;
+
+ err = fuse_filehandle_get(vp, fflag, fufhp, cred, pid);
+ if (err)
+ err = fuse_filehandle_get(vp, FREAD | FWRITE, fufhp, cred, pid);
+ return err;
+}
+
void
fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type,
- struct fuse_filehandle **fufhp, uint64_t fh_id)
+ struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred,
+ struct fuse_open_out *foo)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_filehandle *fufh;
- fufh = &(fvdat->fufh[fufh_type]);
- MPASS(!FUFH_IS_VALID(fufh));
- fufh->fh_id = fh_id;
- fufh->fh_type = fufh_type;
+ fufh = malloc(sizeof(struct fuse_filehandle), M_FUSE_FILEHANDLE,
+ M_WAITOK);
+ MPASS(fufh != NULL);
+ fufh->fh_id = foo->fh;
+ fufh->fufh_type = fufh_type;
+ fufh->gid = cred->cr_rgid;
+ fufh->uid = cred->cr_uid;
+ fufh->pid = td->td_proc->p_pid;
+ fufh->fuse_open_flags = foo->open_flags;
if (!FUFH_IS_VALID(fufh)) {
panic("FUSE: init: invalid filehandle id (type=%d)", fufh_type);
}
+ LIST_INSERT_HEAD(&fvdat->handles, fufh, next);
if (fufhp != NULL)
*fufhp = fufh;
- atomic_add_acq_int(&fuse_fh_count, 1);
+ counter_u64_add(fuse_fh_count, 1);
+
+ if (foo->open_flags & FOPEN_DIRECT_IO) {
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ VTOFUD(vp)->flag |= FN_DIRECTIO;
+ fuse_io_invalbuf(vp, td);
+ } else {
+ if ((foo->open_flags & FOPEN_KEEP_CACHE) == 0)
+ fuse_io_invalbuf(vp, td);
+ VTOFUD(vp)->flag &= ~FN_DIRECTIO;
+ }
+
+}
+
+void
+fuse_file_init(void)
+{
+ fuse_fh_count = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_fh_count);
+}
+
+void
+fuse_file_destroy(void)
+{
+ counter_u64_free(fuse_fh_count);
}
Index: sys/fs/fuse/fuse_internal.h
===================================================================
--- sys/fs/fuse/fuse_internal.h
+++ sys/fs/fuse/fuse_internal.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -61,6 +66,7 @@
#define _FUSE_INTERNAL_H_
#include <sys/types.h>
+#include <sys/counter.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/vnode.h>
@@ -68,6 +74,9 @@
#include "fuse_ipc.h"
#include "fuse_node.h"
+extern counter_u64_t fuse_lookup_cache_hits;
+extern counter_u64_t fuse_lookup_cache_misses;
+
static inline bool
vfs_isrdonly(struct mount *mp)
{
@@ -80,12 +89,6 @@
return (vp->v_mount);
}
-static inline bool
-vnode_mountedhere(struct vnode *vp)
-{
- return (vp->v_mountedhere != NULL);
-}
-
static inline enum vtype
vnode_vtype(struct vnode *vp)
{
@@ -134,12 +137,6 @@
uio->uio_offset = offset;
}
-static inline void
-uio_setresid(struct uio *uio, ssize_t resid)
-{
- uio->uio_resid = resid;
-}
-
/* miscellaneous */
static inline bool
@@ -156,25 +153,57 @@
return (vp->v_mount->mnt_stat.f_iosize);
}
-/* access */
+/*
+ * Make a cacheable timeout in bintime format value based on a fuse_attr_out
+ * response
+ */
+static inline void
+fuse_validity_2_bintime(uint64_t attr_valid, uint32_t attr_valid_nsec,
+ struct bintime *timeout)
+{
+ struct timespec now, duration, timeout_ts;
-#define FVP_ACCESS_NOOP 0x01
+ getnanouptime(&now);
+ /* "+ 2" is the bound of attr_valid_nsec + now.tv_nsec */
+ /* Why oh why isn't there a TIME_MAX defined? */
+ if (attr_valid >= INT_MAX || attr_valid + now.tv_sec + 2 >= INT_MAX) {
+ timeout->sec = INT_MAX;
+ } else {
+ duration.tv_sec = attr_valid;
+ duration.tv_nsec = attr_valid_nsec;
+ timespecadd(&duration, &now, &timeout_ts);
+ timespec2bintime(&timeout_ts, timeout);
+ }
+}
-#define FACCESS_VA_VALID 0x01
-#define FACCESS_DO_ACCESS 0x02
-#define FACCESS_STICKY 0x04
-#define FACCESS_CHOWN 0x08
-#define FACCESS_NOCHECKSPY 0x10
-#define FACCESS_SETGID 0x12
+/*
+ * Make a cacheable timeout value in timespec format based on the fuse_entry_out
+ * response
+ */
+static inline void
+fuse_validity_2_timespec(const struct fuse_entry_out *feo,
+ struct timespec *timeout)
+{
+ struct timespec duration, now;
-#define FACCESS_XQUERIES (FACCESS_STICKY | FACCESS_CHOWN | FACCESS_SETGID)
+ getnanouptime(&now);
+ /* "+ 2" is the bound of entry_valid_nsec + now.tv_nsec */
+ if (feo->entry_valid >= INT_MAX ||
+ feo->entry_valid + now.tv_sec + 2 >= INT_MAX) {
+ timeout->tv_sec = INT_MAX;
+ } else {
+ duration.tv_sec = feo->entry_valid;
+ duration.tv_nsec = feo->entry_valid_nsec;
+ timespecadd(&duration, &now, timeout);
+ }
+}
-struct fuse_access_param {
- uid_t xuid;
- gid_t xgid;
- uint32_t facc_flags;
-};
+/* VFS ops */
+int
+fuse_internal_get_cached_vnode(struct mount*, ino_t, int, struct vnode**);
+
+/* access */
static inline int
fuse_match_cred(struct ucred *basecred, struct ucred *usercred)
{
@@ -189,8 +218,8 @@
return (EPERM);
}
-int fuse_internal_access(struct vnode *vp, mode_t mode,
- struct fuse_access_param *facp, struct thread *td, struct ucred *cred);
+int fuse_internal_access(struct vnode *vp, accmode_t mode,
+ struct thread *td, struct ucred *cred);
/* attributes */
void fuse_internal_cache_attrs(struct vnode *vp, struct fuse_attr *attr,
@@ -198,21 +227,35 @@
/* fsync */
-int fuse_internal_fsync(struct vnode *vp, struct thread *td,
- struct ucred *cred, struct fuse_filehandle *fufh);
+int fuse_internal_fsync(struct vnode *vp, struct thread *td, int waitfor,
+ bool datasync);
int fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio);
-/* readdir */
+/* getattr */
+int fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap,
+ struct ucred *cred, struct thread *td);
+int fuse_internal_getattr(struct vnode *vp, struct vattr *vap,
+ struct ucred *cred, struct thread *td);
+/* asynchronous invalidation */
+int fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio);
+int fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio);
+
+/* mknod */
+int fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
+ struct componentname *cnp, struct vattr *vap);
+
+/* readdir */
struct pseudo_dirent {
uint32_t d_namlen;
};
+int fuse_internal_readdir(struct vnode *vp, struct uio *uio, off_t startoff,
+ struct fuse_filehandle *fufh, struct fuse_iov *cookediov, int *ncookies,
+ u_long *cookies);
+int fuse_internal_readdir_processdata(struct uio *uio, off_t startoff,
+ int *fnd_start, size_t reqsize, void *buf, size_t bufsize,
+ struct fuse_iov *cookediov, int *ncookies, u_long **cookiesp);
-int fuse_internal_readdir(struct vnode *vp, struct uio *uio,
- struct fuse_filehandle *fufh, struct fuse_iov *cookediov);
-int fuse_internal_readdir_processdata(struct uio *uio, size_t reqsize,
- void *buf, size_t bufsize, void *param);
-
/* remove */
int fuse_internal_remove(struct vnode *dvp, struct vnode *vp,
@@ -227,6 +270,10 @@
void fuse_internal_vnode_disappear(struct vnode *vp);
+/* setattr */
+int fuse_internal_setattr(struct vnode *vp, struct vattr *va,
+ struct thread *td, struct ucred *cred);
+
/* strategy */
/* entity creation */
@@ -270,5 +317,9 @@
int fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio);
void fuse_internal_send_init(struct fuse_data *data, struct thread *td);
+
+/* module load/unload */
+void fuse_internal_init(void);
+void fuse_internal_destroy(void);
#endif /* _FUSE_INTERNAL_H_ */
Index: sys/fs/fuse/fuse_internal.c
===================================================================
--- sys/fs/fuse/fuse_internal.c
+++ sys/fs/fuse/fuse_internal.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -59,8 +64,9 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/module.h>
#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/module.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/conf.h>
@@ -89,35 +95,78 @@
#include "fuse.h"
#include "fuse_file.h"
#include "fuse_internal.h"
+#include "fuse_io.h"
#include "fuse_ipc.h"
#include "fuse_node.h"
#include "fuse_file.h"
-#include "fuse_param.h"
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , internal, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , internal, trace, "int", "char*");
#ifdef ZERO_PAD_INCOMPLETE_BUFS
static int isbzero(void *buf, size_t len);
#endif
-/* access */
+counter_u64_t fuse_lookup_cache_hits;
+counter_u64_t fuse_lookup_cache_misses;
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_hits, CTLFLAG_RD,
+ &fuse_lookup_cache_hits, "number of positive cache hits in lookup");
+
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_misses, CTLFLAG_RD,
+ &fuse_lookup_cache_misses, "number of cache misses in lookup");
+
int
+fuse_internal_get_cached_vnode(struct mount* mp, ino_t ino, int flags,
+ struct vnode **vpp)
+{
+ struct bintime now;
+ struct thread *td = curthread;
+ uint64_t nodeid = ino;
+ int error;
+
+ *vpp = NULL;
+
+ error = vfs_hash_get(mp, fuse_vnode_hash(nodeid), flags, td, vpp,
+ fuse_vnode_cmp, &nodeid);
+ if (error)
+ return error;
+ /*
+ * Check the entry cache timeout. We have to do this within fusefs
+ * instead of by using cache_enter_time/cache_lookup because those
+ * routines are only intended to work with pathnames, not inodes
+ */
+ if (*vpp != NULL) {
+ getbinuptime(&now);
+ if (bintime_cmp(&(VTOFUD(*vpp)->entry_cache_timeout), &now, >)){
+ counter_u64_add(fuse_lookup_cache_hits, 1);
+ return 0;
+ } else {
+ /* Entry cache timeout */
+ counter_u64_add(fuse_lookup_cache_misses, 1);
+ cache_purge(*vpp);
+ vput(*vpp);
+ *vpp = NULL;
+ }
+ }
+ return 0;
+}
+
+/* Synchronously send a FUSE_ACCESS operation */
+int
fuse_internal_access(struct vnode *vp,
- mode_t mode,
- struct fuse_access_param *facp,
+ accmode_t mode,
struct thread *td,
struct ucred *cred)
{
int err = 0;
- uint32_t mask = 0;
+ uint32_t mask = F_OK;
int dataflags;
int vtype;
struct mount *mp;
@@ -125,77 +174,57 @@
struct fuse_access_in *fai;
struct fuse_data *data;
- /* NOT YET DONE */
- /*
- * If this vnop gives you trouble, just return 0 here for a lazy
- * kludge.
- */
- /* return 0;*/
-
mp = vnode_mount(vp);
vtype = vnode_vtype(vp);
data = fuse_get_mpdata(mp);
dataflags = data->dataflags;
- if ((mode & VWRITE) && vfs_isrdonly(mp)) {
- return EACCES;
- }
- /* Unless explicitly permitted, deny everyone except the fs owner. */
- if (vnode_isvroot(vp) && !(facp->facc_flags & FACCESS_NOCHECKSPY)) {
- if (!(dataflags & FSESS_DAEMON_CAN_SPY)) {
- int denied = fuse_match_cred(data->daemoncred,
- cred);
+ if (mode == 0)
+ return 0;
- if (denied) {
- return EPERM;
- }
+ if (mode & VMODIFY_PERMS && vfs_isrdonly(mp)) {
+ switch (vp->v_type) {
+ case VDIR:
+ /* FALLTHROUGH */
+ case VLNK:
+ /* FALLTHROUGH */
+ case VREG:
+ return EROFS;
+ default:
+ break;
}
- facp->facc_flags |= FACCESS_NOCHECKSPY;
}
- if (!(facp->facc_flags & FACCESS_DO_ACCESS)) {
- return 0;
+
+ /* Unless explicitly permitted, deny everyone except the fs owner. */
+ if (!(dataflags & FSESS_DAEMON_CAN_SPY)) {
+ if (fuse_match_cred(data->daemoncred, cred))
+ return EPERM;
}
- if (((vtype == VREG) && (mode & VEXEC))) {
-#ifdef NEED_MOUNT_ARGUMENT_FOR_THIS
- /* Let the kernel handle this through open / close heuristics.*/
- return ENOTSUP;
-#else
- /* Let the kernel handle this. */
- return 0;
-#endif
- }
- if (!fsess_isimpl(mp, FUSE_ACCESS)) {
- /* Let the kernel handle this. */
- return 0;
- }
+
if (dataflags & FSESS_DEFAULT_PERMISSIONS) {
- /* Let the kernel handle this. */
- return 0;
+ struct vattr va;
+
+ fuse_internal_getattr(vp, &va, cred, td);
+ return vaccess(vp->v_type, va.va_mode, va.va_uid,
+ va.va_gid, mode, cred, NULL);
}
- if ((mode & VADMIN) != 0) {
- err = priv_check_cred(cred, PRIV_VFS_ADMIN);
- if (err) {
- return err;
- }
- }
- if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0) {
+
+ if (!fsess_isimpl(mp, FUSE_ACCESS))
+ return 0;
+
+ if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0)
mask |= W_OK;
- }
- if ((mode & VREAD) != 0) {
+ if ((mode & VREAD) != 0)
mask |= R_OK;
- }
- if ((mode & VEXEC) != 0) {
+ if ((mode & VEXEC) != 0)
mask |= X_OK;
- }
- bzero(&fdi, sizeof(fdi));
fdisp_init(&fdi, sizeof(*fai));
fdisp_make_vp(&fdi, FUSE_ACCESS, vp, td, cred);
fai = fdi.indata;
- fai->mask = F_OK;
- fai->mask |= mask;
+ fai->mask = mask;
err = fdisp_wait_answ(&fdi);
fdisp_destroy(&fdi);
@@ -208,9 +237,9 @@
}
/*
- * Cache FUSE attributes from feo, in attr cache associated with vnode 'vp'.
- * Optionally, if argument 'vap' is not NULL, store a copy of the converted
- * attributes there as well.
+ * Cache FUSE attributes from attr, in attribute cache associated with vnode
+ * 'vp'. Optionally, if argument 'vap' is not NULL, store a copy of the
+ * converted attributes there as well.
*
* If the nominal attribute cache TTL is zero, do not cache on the 'vp' (but do
* return the result to the caller).
@@ -221,49 +250,57 @@
{
struct mount *mp;
struct fuse_vnode_data *fvdat;
+ struct fuse_data *data;
struct vattr *vp_cache_at;
mp = vnode_mount(vp);
fvdat = VTOFUD(vp);
+ data = fuse_get_mpdata(mp);
- /* Honor explicit do-not-cache requests from user filesystems. */
- if (attr_valid == 0 && attr_valid_nsec == 0)
- fvdat->valid_attr_cache = false;
- else
- fvdat->valid_attr_cache = true;
+ ASSERT_VOP_ELOCKED(vp, "fuse_internal_cache_attrs");
- vp_cache_at = VTOVA(vp);
+ fuse_validity_2_bintime(attr_valid, attr_valid_nsec,
+ &fvdat->attr_cache_timeout);
- if (vap == NULL && vp_cache_at == NULL)
+ /* Fix our buffers if the filesize changed without us knowing */
+ if (vnode_isreg(vp) && attr->size != fvdat->cached_attrs.va_size) {
+ (void)fuse_vnode_setsize(vp, attr->size);
+ fvdat->cached_attrs.va_size = attr->size;
+ }
+
+ if (attr_valid > 0 || attr_valid_nsec > 0)
+ vp_cache_at = &(fvdat->cached_attrs);
+ else if (vap != NULL)
+ vp_cache_at = vap;
+ else
return;
- if (vap == NULL)
- vap = vp_cache_at;
-
- vattr_null(vap);
-
- vap->va_fsid = mp->mnt_stat.f_fsid.val[0];
- vap->va_fileid = attr->ino;
- vap->va_mode = attr->mode & ~S_IFMT;
- vap->va_nlink = attr->nlink;
- vap->va_uid = attr->uid;
- vap->va_gid = attr->gid;
- vap->va_rdev = attr->rdev;
- vap->va_size = attr->size;
+ vattr_null(vp_cache_at);
+ vp_cache_at->va_fsid = mp->mnt_stat.f_fsid.val[0];
+ vp_cache_at->va_fileid = attr->ino;
+ vp_cache_at->va_mode = attr->mode & ~S_IFMT;
+ vp_cache_at->va_nlink = attr->nlink;
+ vp_cache_at->va_uid = attr->uid;
+ vp_cache_at->va_gid = attr->gid;
+ vp_cache_at->va_rdev = attr->rdev;
+ vp_cache_at->va_size = attr->size;
/* XXX on i386, seconds are truncated to 32 bits */
- vap->va_atime.tv_sec = attr->atime;
- vap->va_atime.tv_nsec = attr->atimensec;
- vap->va_mtime.tv_sec = attr->mtime;
- vap->va_mtime.tv_nsec = attr->mtimensec;
- vap->va_ctime.tv_sec = attr->ctime;
- vap->va_ctime.tv_nsec = attr->ctimensec;
- vap->va_blocksize = PAGE_SIZE;
- vap->va_type = IFTOVT(attr->mode);
- vap->va_bytes = attr->blocks * S_BLKSIZE;
- vap->va_flags = 0;
+ vp_cache_at->va_atime.tv_sec = attr->atime;
+ vp_cache_at->va_atime.tv_nsec = attr->atimensec;
+ vp_cache_at->va_mtime.tv_sec = attr->mtime;
+ vp_cache_at->va_mtime.tv_nsec = attr->mtimensec;
+ vp_cache_at->va_ctime.tv_sec = attr->ctime;
+ vp_cache_at->va_ctime.tv_nsec = attr->ctimensec;
+ if (fuse_libabi_geq(data, 7, 9) && attr->blksize > 0)
+ vp_cache_at->va_blocksize = attr->blksize;
+ else
+ vp_cache_at->va_blocksize = PAGE_SIZE;
+ vp_cache_at->va_type = IFTOVT(attr->mode);
+ vp_cache_at->va_bytes = attr->blocks * S_BLKSIZE;
+ vp_cache_at->va_flags = 0;
- if (vap != vp_cache_at && vp_cache_at != NULL)
- memcpy(vp_cache_at, vap, sizeof(*vap));
+ if (vap != vp_cache_at && vap != NULL)
+ memcpy(vap, vp_cache_at, sizeof(*vap));
}
@@ -281,47 +318,195 @@
int
fuse_internal_fsync(struct vnode *vp,
struct thread *td,
- struct ucred *cred,
- struct fuse_filehandle *fufh)
+ int waitfor,
+ bool datasync)
{
- int op = FUSE_FSYNC;
- struct fuse_fsync_in *ffsi;
+ struct fuse_fsync_in *ffsi = NULL;
struct fuse_dispatcher fdi;
+ struct fuse_filehandle *fufh;
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct mount *mp = vnode_mount(vp);
+ int op = FUSE_FSYNC;
+ int err = 0;
- if (vnode_isdir(vp)) {
- op = FUSE_FSYNCDIR;
+ if (!fsess_isimpl(vnode_mount(vp),
+ (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) {
+ return 0;
}
- fdisp_init(&fdi, sizeof(*ffsi));
- fdisp_make_vp(&fdi, op, vp, td, cred);
- ffsi = fdi.indata;
- ffsi->fh = fufh->fh_id;
+ if (vnode_isdir(vp))
+ op = FUSE_FSYNCDIR;
- ffsi->fsync_flags = 1; /* datasync */
+ if (!fsess_isimpl(mp, op))
+ return 0;
- fuse_insert_callback(fdi.tick, fuse_internal_fsync_callback);
- fuse_insert_message(fdi.tick);
+ fdisp_init(&fdi, sizeof(*ffsi));
+ /*
+ * fsync every open file handle for this file, because we can't be sure
+ * which file handle the caller is really referring to.
+ */
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (ffsi == NULL)
+ fdisp_make_vp(&fdi, op, vp, td, NULL);
+ else
+ fdisp_refresh_vp(&fdi, op, vp, td, NULL);
+ ffsi = fdi.indata;
+ ffsi->fh = fufh->fh_id;
+ ffsi->fsync_flags = 0;
+ if (datasync)
+ ffsi->fsync_flags = 1;
+
+ if (waitfor == MNT_WAIT) {
+ err = fdisp_wait_answ(&fdi);
+ } else {
+ fuse_insert_callback(fdi.tick,
+ fuse_internal_fsync_callback);
+ fuse_insert_message(fdi.tick, false);
+ }
+ if (err == ENOSYS) {
+ /* ENOSYS means "success, and don't call again" */
+ fsess_set_notimpl(mp, op);
+ err = 0;
+ break;
+ }
+ }
fdisp_destroy(&fdi);
- return 0;
+ return err;
+}
+/* Asynchronous invalidation */
+SDT_PROBE_DEFINE2(fusefs, , internal, invalidate_cache_hit,
+ "struct vnode*", "struct vnode*");
+int
+fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio)
+{
+ struct fuse_notify_inval_entry_out fnieo;
+ struct componentname cn;
+ struct vnode *dvp, *vp;
+ char name[PATH_MAX];
+ int err;
+
+ if ((err = uiomove(&fnieo, sizeof(fnieo), uio)) != 0)
+ return (err);
+
+ if ((err = uiomove(name, fnieo.namelen, uio)) != 0)
+ return (err);
+ name[fnieo.namelen] = '\0';
+ /* fusefs does not cache "." or ".." entries */
+ if (strncmp(name, ".", sizeof(".")) == 0 ||
+ strncmp(name, "..", sizeof("..")) == 0)
+ return (0);
+
+ if (fnieo.parent == FUSE_ROOT_ID)
+ err = VFS_ROOT(mp, LK_SHARED, &dvp);
+ else
+ err = fuse_internal_get_cached_vnode( mp, fnieo.parent,
+ LK_SHARED, &dvp);
+ /*
+ * If dvp is not in the cache, then it must've been reclaimed. And
+ * since fuse_vnop_reclaim does a cache_purge, name's entry must've
+ * been invalidated already. So we can safely return if dvp == NULL
+ */
+ if (err != 0 || dvp == NULL)
+ return (err);
+ /*
+ * XXX we can't check dvp's generation because the FUSE invalidate
+ * entry message doesn't include it. Worst case is that we invalidate
+ * an entry that didn't need to be invalidated.
+ */
+
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = 0; /* !MAKEENTRY means free cached entry */
+ cn.cn_thread = curthread;
+ cn.cn_cred = curthread->td_ucred;
+ cn.cn_lkflags = LK_SHARED;
+ cn.cn_pnbuf = NULL;
+ cn.cn_nameptr = name;
+ cn.cn_namelen = fnieo.namelen;
+ err = cache_lookup(dvp, &vp, &cn, NULL, NULL);
+ MPASS(err == 0);
+ fuse_vnode_clear_attr_cache(dvp);
+ vput(dvp);
+ return (0);
}
+int
+fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio)
+{
+ struct fuse_notify_inval_inode_out fniio;
+ struct vnode *vp;
+ int err;
+
+ if ((err = uiomove(&fniio, sizeof(fniio), uio)) != 0)
+ return (err);
+
+ if (fniio.ino == FUSE_ROOT_ID)
+ err = VFS_ROOT(mp, LK_EXCLUSIVE, &vp);
+ else
+ err = fuse_internal_get_cached_vnode(mp, fniio.ino, LK_SHARED,
+ &vp);
+ if (err != 0 || vp == NULL)
+ return (err);
+ /*
+ * XXX we can't check vp's generation because the FUSE invalidate
+ * inode message doesn't include it. Worst case is that we invalidate
+ * an inode that didn't need to be invalidated.
+ */
+
+ /*
+ * Flush and invalidate buffers if off >= 0. Technically we only need
+ * to flush and invalidate the range of offsets [off, off + len), but
+ * for simplicity's sake we do everything.
+ */
+ if (fniio.off >= 0)
+ fuse_io_invalbuf(vp, curthread);
+ fuse_vnode_clear_attr_cache(vp);
+ vput(vp);
+ return (0);
+}
+
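
The two handlers above consume the FUSE_NOTIFY_INVAL_ENTRY and FUSE_NOTIFY_INVAL_INODE
messages that a daemon can push down /dev/fuse on its own initiative. A hedged
daemon-side sketch of triggering them, assuming the libfuse 3 low-level API (libfuse 2
takes a struct fuse_chan * instead of a session) and a hypothetical helper name:

    #define FUSE_USE_VERSION 31
    #include <string.h>
    #include <fuse_lowlevel.h>

    /*
     * Hypothetical helper: after the backing store changes behind the
     * kernel's back, drop the cached name entry and the cached data and
     * attributes for the affected inode.
     */
    static void
    invalidate_after_external_change(struct fuse_session *se,
        fuse_ino_t parent, const char *name, fuse_ino_t ino)
    {
        /* Handled by fuse_internal_invalidate_entry() above. */
        fuse_lowlevel_notify_inval_entry(se, parent, name, strlen(name));

        /*
         * Handled by fuse_internal_invalidate_inode() above.  Offset 0 with
         * length 0 invalidates all cached data; a negative offset would
         * invalidate attributes only (hence the "off >= 0" check there).
         */
        fuse_lowlevel_notify_inval_inode(se, ino, 0, 0);
    }
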
+/* mknod */
+int
+fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
+ struct componentname *cnp, struct vattr *vap)
+{
+ struct fuse_data *data;
+ struct fuse_mknod_in fmni;
+ size_t insize;
+
+ data = fuse_get_mpdata(dvp->v_mount);
+
+ fmni.mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ fmni.rdev = vap->va_rdev;
+ if (fuse_libabi_geq(data, 7, 12)) {
+ insize = sizeof(fmni);
+ fmni.umask = curthread->td_proc->p_fd->fd_cmask;
+ } else {
+ insize = FUSE_COMPAT_MKNOD_IN_SIZE;
+ }
+ return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKNOD, &fmni,
+ insize, vap->va_type));
+}
+
/* readdir */
int
fuse_internal_readdir(struct vnode *vp,
struct uio *uio,
+ off_t startoff,
struct fuse_filehandle *fufh,
- struct fuse_iov *cookediov)
+ struct fuse_iov *cookediov,
+ int *ncookies,
+ u_long *cookies)
{
int err = 0;
struct fuse_dispatcher fdi;
- struct fuse_read_in *fri;
+ struct fuse_read_in *fri = NULL;
+ int fnd_start;
- if (uio_resid(uio) == 0) {
+ if (uio_resid(uio) == 0)
return 0;
- }
fdisp_init(&fdi, 0);
/*
@@ -329,51 +514,70 @@
* I/O).
*/
+ /*
+ * fnd_start is set non-zero once the offset in the directory gets
+ * to the startoff. This is done because directories must be read
+ * from the beginning (offset == 0) when fuse_vnop_readdir() needs
+ * to do an open of the directory.
+ * If it is not set non-zero here, it will be set non-zero in
+ * fuse_internal_readdir_processdata() when uio_offset == startoff.
+ */
+ fnd_start = 0;
+ if (uio->uio_offset == startoff)
+ fnd_start = 1;
while (uio_resid(uio) > 0) {
-
fdi.iosize = sizeof(*fri);
- fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
+ if (fri == NULL)
+ fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
+ else
+ fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
fri = fdi.indata;
fri->fh = fufh->fh_id;
fri->offset = uio_offset(uio);
- fri->size = min(uio_resid(uio), FUSE_DEFAULT_IOSIZE);
- /* mp->max_read */
+ fri->size = MIN(uio->uio_resid,
+ fuse_get_mpdata(vp->v_mount)->max_read);
- if ((err = fdisp_wait_answ(&fdi))) {
+ if ((err = fdisp_wait_answ(&fdi)))
break;
- }
- if ((err = fuse_internal_readdir_processdata(uio, fri->size, fdi.answ,
- fdi.iosize, cookediov))) {
+ if ((err = fuse_internal_readdir_processdata(uio, startoff,
+ &fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov,
+ ncookies, &cookies)))
break;
- }
}
fdisp_destroy(&fdi);
return ((err == -1) ? 0 : err);
}
+/*
+ * Return -1 to indicate that this readdir is finished, 0 if it copied
+ * all of the directory data that was read in (in which case it may be
+ * possible to read more), and a value greater than 0 on failure.
+ */
int
fuse_internal_readdir_processdata(struct uio *uio,
+ off_t startoff,
+ int *fnd_start,
size_t reqsize,
void *buf,
size_t bufsize,
- void *param)
+ struct fuse_iov *cookediov,
+ int *ncookies,
+ u_long **cookiesp)
{
int err = 0;
- int cou = 0;
int bytesavail;
size_t freclen;
struct dirent *de;
struct fuse_dirent *fudge;
- struct fuse_iov *cookediov = param;
+ u_long *cookies;
- if (bufsize < FUSE_NAME_OFFSET) {
+ cookies = *cookiesp;
+ if (bufsize < FUSE_NAME_OFFSET)
return -1;
- }
for (;;) {
-
if (bufsize < FUSE_NAME_OFFSET) {
err = -1;
break;
@@ -381,10 +585,12 @@
fudge = (struct fuse_dirent *)buf;
freclen = FUSE_DIRENT_SIZE(fudge);
- cou++;
-
if (bufsize < freclen) {
- err = ((cou == 1) ? -1 : 0);
+ /*
+ * This indicates a partial directory entry at the
+ * end of the directory data.
+ */
+ err = -1;
break;
}
#ifdef ZERO_PAD_INCOMPLETE_BUFS
@@ -402,30 +608,47 @@
&fudge->namelen);
if (bytesavail > uio_resid(uio)) {
+ /* Out of space for the dir so we are done. */
err = -1;
break;
}
- fiov_refresh(cookediov);
- fiov_adjust(cookediov, bytesavail);
+ /*
+ * Don't start to copy the directory entries out until
+ * the requested offset in the directory is found.
+ */
+ if (*fnd_start != 0) {
+ fiov_adjust(cookediov, bytesavail);
+ bzero(cookediov->base, bytesavail);
- de = (struct dirent *)cookediov->base;
- de->d_fileno = fudge->ino;
- de->d_reclen = bytesavail;
- de->d_type = fudge->type;
- de->d_namlen = fudge->namelen;
- memcpy((char *)cookediov->base + sizeof(struct dirent) -
- MAXNAMLEN - 1,
- (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
- dirent_terminate(de);
+ de = (struct dirent *)cookediov->base;
+ de->d_fileno = fudge->ino;
+ de->d_reclen = bytesavail;
+ de->d_type = fudge->type;
+ de->d_namlen = fudge->namelen;
+ memcpy((char *)cookediov->base + sizeof(struct dirent) -
+ MAXNAMLEN - 1,
+ (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
+ dirent_terminate(de);
- err = uiomove(cookediov->base, cookediov->len, uio);
- if (err) {
- break;
- }
+ err = uiomove(cookediov->base, cookediov->len, uio);
+ if (err)
+ break;
+ if (cookies != NULL) {
+ if (*ncookies == 0) {
+ err = -1;
+ break;
+ }
+ *cookies = fudge->off;
+ cookies++;
+ (*ncookies)--;
+ }
+ } else if (startoff == fudge->off)
+ *fnd_start = 1;
buf = (char *)buf + freclen;
bufsize -= freclen;
uio_setoffset(uio, fudge->off);
}
+ *cookiesp = cookies;
return err;
}
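
The records unpacked by fuse_internal_readdir_processdata() are packed by the daemon,
and fudge->off, which becomes both the resume offset and the VOP_READDIR cookie, is
whatever offset cookie the daemon chose for each entry. A hedged daemon-side sketch,
assuming the libfuse low-level API and a hypothetical directory holding one file:

    #define FUSE_USE_VERSION 31
    #include <string.h>
    #include <sys/stat.h>
    #include <fuse_lowlevel.h>

    /* Hypothetical readdir handler for a directory with a single file. */
    static void
    example_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
        struct fuse_file_info *fi)
    {
        char buf[4096];
        size_t pos = 0;
        struct stat st;

        (void)ino; (void)size; (void)fi;
        memset(&st, 0, sizeof(st));
        st.st_ino = 2;
        st.st_mode = S_IFREG;
        if (off == 0) {
            /*
             * The last argument is the entry's offset cookie: the value
             * the kernel saves in the cookie array and sends back as
             * fri->offset when it resumes the listing.
             */
            pos += fuse_add_direntry(req, buf + pos, sizeof(buf) - pos,
                "hello", &st, 1);
        }
        /* An empty reply tells the kernel it has reached EOF. */
        fuse_reply_buf(req, buf, pos);
    }
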
@@ -439,12 +662,9 @@
enum fuse_opcode op)
{
struct fuse_dispatcher fdi;
- struct fuse_vnode_data *fvdat;
- int err;
+ nlink_t nlink;
+ int err = 0;
- err = 0;
- fvdat = VTOFUD(vp);
-
fdisp_init(&fdi, cnp->cn_namelen + 1);
fdisp_make_vp(&fdi, op, dvp, cnp->cn_thread, cnp->cn_cred);
@@ -453,6 +673,35 @@
err = fdisp_wait_answ(&fdi);
fdisp_destroy(&fdi);
+
+ if (err)
+ return (err);
+
+ /*
+ * Access the cached nlink even if the attr cache has expired. If
+ * it's inaccurate, the worst that will happen is:
+ * 1) We'll recycle the vnode even though the file has another link we
+ * don't know about, costing a bit of cpu time, or
+ * 2) We won't recycle the vnode even though all of its links are gone.
+ * It will linger around until vnlru reclaims it, costing a bit of
+ * temporary memory.
+ */
+ nlink = VTOFUD(vp)->cached_attrs.va_nlink--;
+
+ /*
+ * Purge the parent's attribute cache because the daemon
+ * should've updated its mtime and ctime.
+ */
+ fuse_vnode_clear_attr_cache(dvp);
+
+ /* NB: nlink could be zero if it was never cached */
+ if (nlink <= 1 || vnode_vtype(vp) == VDIR) {
+ fuse_internal_vnode_disappear(vp);
+ } else {
+ cache_purge(vp);
+ fuse_vnode_update(vp, FN_CTIMECHANGE);
+ }
+
return err;
}
@@ -532,6 +781,13 @@
feo->nodeid, 1);
return err;
}
+
+ /*
+ * Purge the parent's attribute cache because the daemon should've
+ * updated its mtime and ctime
+ */
+ fuse_vnode_clear_attr_cache(dvp);
+
fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
feo->attr_valid_nsec, NULL);
@@ -593,10 +849,79 @@
ffi = fdi.indata;
ffi->nlookup = nlookup;
- fuse_insert_message(fdi.tick);
+ fuse_insert_message(fdi.tick, false);
fdisp_destroy(&fdi);
}
+/* Fetch the vnode's attributes from the daemon */
+int
+fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap,
+ struct ucred *cred, struct thread *td)
+{
+ struct fuse_dispatcher fdi;
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_getattr_in *fgai;
+ struct fuse_attr_out *fao;
+ off_t old_filesize = fvdat->cached_attrs.va_size;
+ struct timespec old_ctime = fvdat->cached_attrs.va_ctime;
+ struct timespec old_mtime = fvdat->cached_attrs.va_mtime;
+ enum vtype vtyp;
+ int err;
+
+ fdisp_init(&fdi, 0);
+ fdisp_make_vp(&fdi, FUSE_GETATTR, vp, td, cred);
+ fgai = fdi.indata;
+ /*
+ * We could look up a file handle and set it in fgai->fh, but that
+ * involves extra runtime work and I'm unaware of any file systems that
+ * care.
+ */
+ fgai->getattr_flags = 0;
+ if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) {
+ if (err == ENOENT)
+ fuse_internal_vnode_disappear(vp);
+ goto out;
+ }
+
+ fao = (struct fuse_attr_out *)fdi.answ;
+ vtyp = IFTOVT(fao->attr.mode);
+ if (fvdat->flag & FN_SIZECHANGE)
+ fao->attr.size = old_filesize;
+ if (fvdat->flag & FN_CTIMECHANGE) {
+ fao->attr.ctime = old_ctime.tv_sec;
+ fao->attr.ctimensec = old_ctime.tv_nsec;
+ }
+ if (fvdat->flag & FN_MTIMECHANGE) {
+ fao->attr.mtime = old_mtime.tv_sec;
+ fao->attr.mtimensec = old_mtime.tv_nsec;
+ }
+ fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
+ fao->attr_valid_nsec, vap);
+ if (vtyp != vnode_vtype(vp)) {
+ fuse_internal_vnode_disappear(vp);
+ err = ENOENT;
+ }
+
+out:
+ fdisp_destroy(&fdi);
+ return err;
+}
+
+/* Read a vnode's attributes from cache or fetch them from the fuse daemon */
+int
+fuse_internal_getattr(struct vnode *vp, struct vattr *vap, struct ucred *cred,
+ struct thread *td)
+{
+ struct vattr *attrs;
+
+ if ((attrs = VTOVA(vp)) != NULL) {
+ *vap = *attrs; /* struct copy */
+ return 0;
+ }
+
+ return fuse_internal_do_getattr(vp, vap, cred, td);
+}
+
void
fuse_internal_vnode_disappear(struct vnode *vp)
{
@@ -604,7 +929,6 @@
ASSERT_VOP_ELOCKED(vp, "fuse_internal_vnode_disappear");
fvdat->flag |= FN_REVOKED;
- fvdat->valid_attr_cache = false;
cache_purge(vp);
}
@@ -625,27 +949,69 @@
}
fiio = fticket_resp(tick)->base;
- /* XXX: Do we want to check anything further besides this? */
- if (fiio->major < 7) {
- SDT_PROBE2(fuse, , internal, trace, 1,
+ data->fuse_libabi_major = fiio->major;
+ data->fuse_libabi_minor = fiio->minor;
+ if (!fuse_libabi_geq(data, 7, 4)) {
+ /*
+ * With a little work we could support servers as old as 7.1.
+ * But there would be little payoff.
+ */
+ SDT_PROBE2(fusefs, , internal, trace, 1,
"userpace version too low");
err = EPROTONOSUPPORT;
goto out;
}
- data->fuse_libabi_major = fiio->major;
- data->fuse_libabi_minor = fiio->minor;
if (fuse_libabi_geq(data, 7, 5)) {
- if (fticket_resp(tick)->len == sizeof(struct fuse_init_out)) {
+ if (fticket_resp(tick)->len == sizeof(struct fuse_init_out) ||
+ fticket_resp(tick)->len == FUSE_COMPAT_22_INIT_OUT_SIZE) {
data->max_write = fiio->max_write;
+ if (fiio->flags & FUSE_ASYNC_READ)
+ data->dataflags |= FSESS_ASYNC_READ;
+ if (fiio->flags & FUSE_POSIX_LOCKS)
+ data->dataflags |= FSESS_POSIX_LOCKS;
+ if (fiio->flags & FUSE_EXPORT_SUPPORT)
+ data->dataflags |= FSESS_EXPORT_SUPPORT;
+ /*
+ * Don't bother to check FUSE_BIG_WRITES, because it's
+ * redundant with max_write
+ */
+ /*
+ * max_background and congestion_threshold are not
+ * implemented
+ */
} else {
err = EINVAL;
}
} else {
- /* Old fix values */
+ /* Old fixed values */
data->max_write = 4096;
}
+ if (fuse_libabi_geq(data, 7, 6))
+ data->max_readahead_blocks = fiio->max_readahead / maxbcachebuf;
+
+ if (!fuse_libabi_geq(data, 7, 7))
+ fsess_set_notimpl(data->mp, FUSE_INTERRUPT);
+
+ if (!fuse_libabi_geq(data, 7, 8)) {
+ fsess_set_notimpl(data->mp, FUSE_BMAP);
+ fsess_set_notimpl(data->mp, FUSE_DESTROY);
+ }
+
+ if (fuse_libabi_geq(data, 7, 23) && fiio->time_gran >= 1 &&
+ fiio->time_gran <= 1000000000)
+ data->time_gran = fiio->time_gran;
+ else
+ data->time_gran = 1;
+
+ if (!fuse_libabi_geq(data, 7, 23))
+ data->cache_mode = fuse_data_cache_mode;
+ else if (fiio->flags & FUSE_WRITEBACK_CACHE)
+ data->cache_mode = FUSE_CACHE_WB;
+ else
+ data->cache_mode = FUSE_CACHE_WT;
+
out:
if (err) {
fdata_set_dead(data);
@@ -669,14 +1035,156 @@
fiii = fdi.indata;
fiii->major = FUSE_KERNEL_VERSION;
fiii->minor = FUSE_KERNEL_MINOR_VERSION;
- fiii->max_readahead = FUSE_DEFAULT_IOSIZE * 16;
- fiii->flags = 0;
+ /*
+ * fusefs currently reads ahead no more than one cache block at a time.
+ * See fuse_read_biobackend
+ */
+ fiii->max_readahead = maxbcachebuf;
+ /*
+ * Unsupported features:
+ * FUSE_FILE_OPS: No known FUSE server or client supports it
+ * FUSE_ATOMIC_O_TRUNC: our VFS cannot support it
+ * FUSE_DONT_MASK: unlike Linux, FreeBSD always applies the umask, even
+ * when default ACLs are in use.
+ * FUSE_SPLICE_WRITE, FUSE_SPLICE_MOVE, FUSE_SPLICE_READ: FreeBSD
+ * doesn't have splice(2).
+ * FUSE_FLOCK_LOCKS: not yet implemented
+ * FUSE_HAS_IOCTL_DIR: not yet implemented
+ * FUSE_AUTO_INVAL_DATA: not yet implemented
+ * FUSE_DO_READDIRPLUS: not yet implemented
+ * FUSE_READDIRPLUS_AUTO: not yet implemented
+ * FUSE_ASYNC_DIO: not yet implemented
+ * FUSE_NO_OPEN_SUPPORT: not yet implemented
+ */
+ fiii->flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_EXPORT_SUPPORT
+ | FUSE_BIG_WRITES | FUSE_WRITEBACK_CACHE;
fuse_insert_callback(fdi.tick, fuse_internal_init_callback);
- fuse_insert_message(fdi.tick);
+ fuse_insert_message(fdi.tick, false);
fdisp_destroy(&fdi);
}
+/*
+ * Send a FUSE_SETATTR operation with no permissions checks. If cred is NULL,
+ * send the request with root credentials
+ */
+int fuse_internal_setattr(struct vnode *vp, struct vattr *vap,
+ struct thread *td, struct ucred *cred)
+{
+ struct fuse_vnode_data *fvdat;
+ struct fuse_dispatcher fdi;
+ struct fuse_setattr_in *fsai;
+ struct mount *mp;
+ pid_t pid = td->td_proc->p_pid;
+ struct fuse_data *data;
+ int dataflags;
+ int err = 0;
+ enum vtype vtyp;
+ int sizechanged = -1;
+ uint64_t newsize = 0;
+
+ mp = vnode_mount(vp);
+ fvdat = VTOFUD(vp);
+ data = fuse_get_mpdata(mp);
+ dataflags = data->dataflags;
+
+ fdisp_init(&fdi, sizeof(*fsai));
+ fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred);
+ if (!cred) {
+ fdi.finh->uid = 0;
+ fdi.finh->gid = 0;
+ }
+ fsai = fdi.indata;
+ fsai->valid = 0;
+
+ if (vap->va_uid != (uid_t)VNOVAL) {
+ fsai->uid = vap->va_uid;
+ fsai->valid |= FATTR_UID;
+ }
+ if (vap->va_gid != (gid_t)VNOVAL) {
+ fsai->gid = vap->va_gid;
+ fsai->valid |= FATTR_GID;
+ }
+ if (vap->va_size != VNOVAL) {
+ struct fuse_filehandle *fufh = NULL;
+
+ /* Truncate to a new value. */
+ fsai->size = vap->va_size;
+ sizechanged = 1;
+ newsize = vap->va_size;
+ fsai->valid |= FATTR_SIZE;
+
+ fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid);
+ if (fufh) {
+ fsai->fh = fufh->fh_id;
+ fsai->valid |= FATTR_FH;
+ }
+ VTOFUD(vp)->flag &= ~FN_SIZECHANGE;
+ }
+ if (vap->va_atime.tv_sec != VNOVAL) {
+ fsai->atime = vap->va_atime.tv_sec;
+ fsai->atimensec = vap->va_atime.tv_nsec;
+ fsai->valid |= FATTR_ATIME;
+ if (vap->va_vaflags & VA_UTIMES_NULL)
+ fsai->valid |= FATTR_ATIME_NOW;
+ }
+ if (vap->va_mtime.tv_sec != VNOVAL) {
+ fsai->mtime = vap->va_mtime.tv_sec;
+ fsai->mtimensec = vap->va_mtime.tv_nsec;
+ fsai->valid |= FATTR_MTIME;
+ if (vap->va_vaflags & VA_UTIMES_NULL)
+ fsai->valid |= FATTR_MTIME_NOW;
+ } else if (fvdat->flag & FN_MTIMECHANGE) {
+ fsai->mtime = fvdat->cached_attrs.va_mtime.tv_sec;
+ fsai->mtimensec = fvdat->cached_attrs.va_mtime.tv_nsec;
+ fsai->valid |= FATTR_MTIME;
+ }
+ if (fuse_libabi_geq(data, 7, 23) && fvdat->flag & FN_CTIMECHANGE) {
+ fsai->ctime = fvdat->cached_attrs.va_ctime.tv_sec;
+ fsai->ctimensec = fvdat->cached_attrs.va_ctime.tv_nsec;
+ fsai->valid |= FATTR_CTIME;
+ }
+ if (vap->va_mode != (mode_t)VNOVAL) {
+ fsai->mode = vap->va_mode & ALLPERMS;
+ fsai->valid |= FATTR_MODE;
+ }
+ if (!fsai->valid) {
+ goto out;
+ }
+
+ if ((err = fdisp_wait_answ(&fdi)))
+ goto out;
+ vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode);
+
+ if (vnode_vtype(vp) != vtyp) {
+ if (vnode_vtype(vp) == VNON && vtyp != VNON) {
+ SDT_PROBE2(fusefs, , internal, trace, 1, "FUSE: Dang! "
+ "vnode_vtype is VNON and vtype isn't.");
+ } else {
+ /*
+ * STALE vnode, ditch
+ *
+ * The vnode has changed its type "behind our back".
+ * There's nothing really we can do, so let us just
+ * force an internal revocation and tell the caller to
+ * try again, if interested.
+ */
+ fuse_internal_vnode_disappear(vp);
+ err = EAGAIN;
+ }
+ }
+ if (err == 0) {
+ struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ;
+ fuse_vnode_undirty_cached_timestamps(vp);
+ fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
+ fao->attr_valid_nsec, NULL);
+ }
+
+out:
+ fdisp_destroy(&fdi);
+ return err;
+}
+
#ifdef ZERO_PAD_INCOMPLETE_BUFS
static int
isbzero(void *buf, size_t len)
@@ -692,3 +1200,19 @@
}
#endif
+
+void
+fuse_internal_init(void)
+{
+ fuse_lookup_cache_misses = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_lookup_cache_misses);
+ fuse_lookup_cache_hits = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_lookup_cache_hits);
+}
+
+void
+fuse_internal_destroy(void)
+{
+ counter_u64_free(fuse_lookup_cache_hits);
+ counter_u64_free(fuse_lookup_cache_misses);
+}
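
The entry-cache timeout checked in fuse_internal_get_cached_vnode() and the attribute
cache armed in fuse_internal_cache_attrs() earlier in this file are both driven by the
entry_valid/attr_valid TTLs that the daemon returns with each FUSE_LOOKUP reply. A
hedged daemon-side sketch, assuming the libfuse low-level API and a hypothetical
one-file filesystem:

    #define FUSE_USE_VERSION 31
    #include <errno.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <fuse_lowlevel.h>

    /*
     * Hypothetical lookup handler.  entry_timeout and attr_timeout (in
     * seconds) become the bintime deadlines checked by the kernel; a
     * timeout of zero disables caching of that item entirely.
     */
    static void
    example_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
    {
        struct fuse_entry_param e;

        if (parent != FUSE_ROOT_ID || strcmp(name, "hello") != 0) {
            fuse_reply_err(req, ENOENT);
            return;
        }
        memset(&e, 0, sizeof(e));
        e.ino = 2;
        e.attr.st_ino = 2;
        e.attr.st_mode = S_IFREG | 0444;
        e.attr.st_nlink = 1;
        e.entry_timeout = 60.0;  /* cache the name-to-inode mapping for 60s */
        e.attr_timeout = 1.0;    /* cache the attributes for 1s */
        fuse_reply_entry(req, &e);
    }
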
Index: sys/fs/fuse/fuse_io.h
===================================================================
--- sys/fs/fuse/fuse_io.h
+++ sys/fs/fuse/fuse_io.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -61,7 +66,7 @@
#define _FUSE_IO_H_
int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
- struct ucred *cred);
+ struct ucred *cred, pid_t pid);
int fuse_io_strategy(struct vnode *vp, struct buf *bp);
int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td);
int fuse_io_invalbuf(struct vnode *vp, struct thread *td);
Index: sys/fs/fuse/fuse_io.c
===================================================================
--- sys/fs/fuse/fuse_io.c
+++ sys/fs/fuse/fuse_io.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -72,6 +77,7 @@
#include <sys/sx.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
@@ -83,6 +89,7 @@
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -98,45 +105,108 @@
#include "fuse_ipc.h"
#include "fuse_io.h"
-SDT_PROVIDER_DECLARE(fuse);
/*
+ * Set in a struct buf to indicate that the write came from the buffer cache
+ * and the originating cred and pid are no longer known.
+ */
+#define B_FUSEFS_WRITE_CACHE B_FS_FLAG1
+
+SDT_PROVIDER_DECLARE(fusefs);
+/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , io, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*");
+static void
+fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred,
+ struct thread *td);
static int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
struct ucred *cred, struct fuse_filehandle *fufh);
static int
-fuse_read_biobackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh);
+fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
+ struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid);
static int
fuse_write_directbackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);
+ struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize,
+ int ioflag, bool pages);
static int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);
+ struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid);
-SDT_PROBE_DEFINE5(fuse, , io, io_dispatch, "struct vnode*", "struct uio*",
+/*
+ * FreeBSD clears the SUID and SGID bits on any write by a non-root user.
+ */
+static void
+fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred,
+ struct thread *td)
+{
+ struct fuse_data *data;
+ struct mount *mp;
+ struct vattr va;
+ int dataflags;
+
+ mp = vnode_mount(vp);
+ data = fuse_get_mpdata(mp);
+ dataflags = data->dataflags;
+
+ if (dataflags & FSESS_DEFAULT_PERMISSIONS) {
+ if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) {
+ fuse_internal_getattr(vp, &va, cred, td);
+ if (va.va_mode & (S_ISUID | S_ISGID)) {
+ mode_t mode = va.va_mode & ~(S_ISUID | S_ISGID);
+ /* Clear all vattr fields except mode */
+ vattr_null(&va);
+ va.va_mode = mode;
+
+ /*
+ * Ignore fuse_internal_setattr's return value,
+ * because at this point the write operation has
+ * already succeeded and we don't want to return
+ * failing status for that.
+ */
+ (void)fuse_internal_setattr(vp, &va, td, NULL);
+ }
+ }
+ }
+}
+
+SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch, "struct vnode*", "struct uio*",
"int", "struct ucred*", "struct fuse_filehandle*");
+SDT_PROBE_DEFINE4(fusefs, , io, io_dispatch_filehandles_closed, "struct vnode*",
+ "struct uio*", "int", "struct ucred*");
int
fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
- struct ucred *cred)
+ struct ucred *cred, pid_t pid)
{
struct fuse_filehandle *fufh;
int err, directio;
+ int fflag;
+ bool closefufh = false;
MPASS(vp->v_type == VREG || vp->v_type == VDIR);
- err = fuse_filehandle_getrw(vp,
- (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
- if (err) {
+ fflag = (uio->uio_rw == UIO_READ) ? FREAD : FWRITE;
+ err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
+ if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
+ /*
+ * nfsd will do I/O without first doing VOP_OPEN. We
+ * must implicitly open the file here
+ */
+ err = fuse_filehandle_open(vp, fflag, &fufh, curthread, cred);
+ closefufh = true;
+ }
+ else if (err) {
+ SDT_PROBE4(fusefs, , io, io_dispatch_filehandles_closed,
+ vp, uio, ioflag, cred);
printf("FUSE: io dispatch: filehandles are closed\n");
return err;
}
- SDT_PROBE5(fuse, , io, io_dispatch, vp, uio, ioflag, cred, fufh);
+ if (err)
+ goto out;
+ SDT_PROBE5(fusefs, , io, io_dispatch, vp, uio, ioflag, cred, fufh);
/*
* Ideally, when the daemon asks for direct io at open time, the
@@ -153,108 +223,136 @@
switch (uio->uio_rw) {
case UIO_READ:
if (directio) {
- SDT_PROBE2(fuse, , io, trace, 1,
+ SDT_PROBE2(fusefs, , io, trace, 1,
"direct read of vnode");
err = fuse_read_directbackend(vp, uio, cred, fufh);
} else {
- SDT_PROBE2(fuse, , io, trace, 1,
+ SDT_PROBE2(fusefs, , io, trace, 1,
"buffered read of vnode");
- err = fuse_read_biobackend(vp, uio, cred, fufh);
+ err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh,
+ pid);
}
break;
case UIO_WRITE:
- /*
- * Kludge: simulate write-through caching via write-around
- * caching. Same effect, as far as never caching dirty data,
- * but slightly pessimal in that newly written data is not
- * cached.
- */
- if (directio || fuse_data_cache_mode == FUSE_CACHE_WT) {
- SDT_PROBE2(fuse, , io, trace, 1,
+ fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE);
+ if (directio) {
+ const int iosize = fuse_iosize(vp);
+ off_t start, end, filesize;
+
+ SDT_PROBE2(fusefs, , io, trace, 1,
"direct write of vnode");
- err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag);
+
+ err = fuse_vnode_size(vp, &filesize, cred, curthread);
+ if (err)
+ goto out;
+
+ start = uio->uio_offset;
+ end = start + uio->uio_resid;
+ KASSERT((ioflag & (IO_VMIO | IO_DIRECT)) !=
+ (IO_VMIO | IO_DIRECT),
+ ("IO_DIRECT used for a cache flush?"));
+ /* Invalidate the write cache when writing directly */
+ v_inval_buf_range(vp, start, end, iosize);
+ err = fuse_write_directbackend(vp, uio, cred, fufh,
+ filesize, ioflag, false);
} else {
- SDT_PROBE2(fuse, , io, trace, 1,
+ SDT_PROBE2(fusefs, , io, trace, 1,
"buffered write of vnode");
- err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag);
+ if (!fsess_opt_writeback(vnode_mount(vp)))
+ ioflag |= IO_SYNC;
+ err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag,
+ pid);
}
+ fuse_io_clear_suid_on_write(vp, cred, uio->uio_td);
break;
default:
panic("uninterpreted mode passed to fuse_io_dispatch");
}
+out:
+ if (closefufh)
+ fuse_filehandle_close(vp, fufh, curthread, cred);
+
return (err);
}
-SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_start, "int", "int", "int");
-SDT_PROBE_DEFINE2(fuse, , io, read_bio_backend_feed, "int", "int");
-SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_end, "int", "ssize_t", "int");
+SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int");
+SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "struct buf*");
+SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int",
+ "struct buf*");
static int
-fuse_read_biobackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh)
+fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
+ struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid)
{
struct buf *bp;
- daddr_t lbn;
- int bcount;
- int err = 0, n = 0, on = 0;
+ struct mount *mp;
+ struct fuse_data *data;
+ daddr_t lbn, nextlbn;
+ int bcount, nextsize;
+ int err, n = 0, on = 0, seqcount;
off_t filesize;
const int biosize = fuse_iosize(vp);
+ mp = vnode_mount(vp);
+ data = fuse_get_mpdata(mp);
- if (uio->uio_resid == 0)
- return (0);
if (uio->uio_offset < 0)
return (EINVAL);
- bcount = biosize;
- filesize = VTOFUD(vp)->filesize;
+ seqcount = ioflag >> IO_SEQSHIFT;
- do {
+ err = fuse_vnode_size(vp, &filesize, cred, curthread);
+ if (err)
+ return err;
+
+ for (err = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if (fuse_isdeadfs(vp)) {
err = ENXIO;
break;
}
+ if (filesize - uio->uio_offset <= 0)
+ break;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
- SDT_PROBE3(fuse, , io, read_bio_backend_start,
- biosize, (int)lbn, on);
-
- /*
- * Obtain the buffer cache block. Figure out the buffer size
- * when we are at EOF. If we are modifying the size of the
- * buffer based on an EOF condition we need to hold
- * nfs_rslock() through obtaining the buffer to prevent
- * a potential writer-appender from messing with n_size.
- * Otherwise we may accidentally truncate the buffer and
- * lose dirty data.
- *
- * Note that bcount is *not* DEV_BSIZE aligned.
- */
if ((off_t)lbn * biosize >= filesize) {
bcount = 0;
} else if ((off_t)(lbn + 1) * biosize > filesize) {
bcount = filesize - (off_t)lbn *biosize;
+ } else {
+ bcount = biosize;
}
- bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
+ nextlbn = lbn + 1;
+ nextsize = MIN(biosize, filesize - nextlbn * biosize);
- if (!bp)
- return (EINTR);
+ SDT_PROBE4(fusefs, , io, read_bio_backend_start,
+ biosize, (int)lbn, on, bcount);
- /*
- * If B_CACHE is not set, we must issue the read. If this
- * fails, we return an error.
- */
+ if (bcount < biosize) {
+ /* If near EOF, don't do readahead */
+ err = bread(vp, lbn, bcount, NOCRED, &bp);
+ } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
+ /* Try clustered read */
+ long totread = uio->uio_resid + on;
+ seqcount = MIN(seqcount,
+ data->max_readahead_blocks + 1);
+ err = cluster_read(vp, filesize, lbn, bcount, NOCRED,
+ totread, seqcount, 0, &bp);
+ } else if (seqcount > 1 && data->max_readahead_blocks >= 1) {
+ /* Try non-clustered readahead */
+ err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1,
+ NOCRED, &bp);
+ } else {
+ /* Just read what was requested */
+ err = bread(vp, lbn, bcount, NOCRED, &bp);
+ }
- if ((bp->b_flags & B_CACHE) == 0) {
- bp->b_iocmd = BIO_READ;
- vfs_busy_pages(bp, 0);
- err = fuse_io_strategy(vp, bp);
- if (err) {
- brelse(bp);
- return (err);
- }
+ if (err) {
+ brelse(bp);
+ bp = NULL;
+ break;
}
+
/*
* on is the offset into the current bp. Figure out how many
* bytes we can copy out of the bp. Note that bcount is
@@ -264,33 +362,41 @@
*/
n = 0;
- if (on < bcount)
- n = MIN((unsigned)(bcount - on), uio->uio_resid);
+ if (on < bcount - bp->b_resid)
+ n = MIN((unsigned)(bcount - bp->b_resid - on),
+ uio->uio_resid);
if (n > 0) {
- SDT_PROBE2(fuse, , io, read_bio_backend_feed,
- n, n + (int)bp->b_resid);
+ SDT_PROBE2(fusefs, , io, read_bio_backend_feed, n, bp);
err = uiomove(bp->b_data + on, n, uio);
}
- brelse(bp);
- SDT_PROBE3(fuse, , io, read_bio_backend_end, err,
- uio->uio_resid, n);
- } while (err == 0 && uio->uio_resid > 0 && n > 0);
+ vfs_bio_brelse(bp, ioflag);
+ SDT_PROBE4(fusefs, , io, read_bio_backend_end, err,
+ uio->uio_resid, n, bp);
+ if (bp->b_resid > 0) {
+ /* Short read indicates EOF */
+ break;
+ }
+ }
return (err);
}
-SDT_PROBE_DEFINE1(fuse, , io, read_directbackend_start, "struct fuse_read_in*");
-SDT_PROBE_DEFINE2(fuse, , io, read_directbackend_complete,
- "struct fuse_dispatcher*", "struct uio*");
+SDT_PROBE_DEFINE1(fusefs, , io, read_directbackend_start,
+ "struct fuse_read_in*");
+SDT_PROBE_DEFINE3(fusefs, , io, read_directbackend_complete,
+ "struct fuse_dispatcher*", "struct fuse_read_in*", "struct uio*");
static int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
struct ucred *cred, struct fuse_filehandle *fufh)
{
+ struct fuse_data *data;
struct fuse_dispatcher fdi;
struct fuse_read_in *fri;
int err = 0;
+ data = fuse_get_mpdata(vp->v_mount);
+
if (uio->uio_resid == 0)
return (0);
@@ -312,19 +418,29 @@
fri->offset = uio->uio_offset;
fri->size = MIN(uio->uio_resid,
fuse_get_mpdata(vp->v_mount)->max_read);
+ if (fuse_libabi_geq(data, 7, 9)) {
+ /* See comment regarding FUSE_WRITE_LOCKOWNER */
+ fri->read_flags = 0;
+ fri->flags = fufh_type_2_fflags(fufh->fufh_type);
+ }
- SDT_PROBE1(fuse, , io, read_directbackend_start, fri);
+ SDT_PROBE1(fusefs, , io, read_directbackend_start, fri);
if ((err = fdisp_wait_answ(&fdi)))
goto out;
- SDT_PROBE2(fuse, , io, read_directbackend_complete,
- fdi.iosize, uio);
+ SDT_PROBE3(fusefs, , io, read_directbackend_complete,
+ &fdi, fri, uio);
if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio)))
break;
- if (fdi.iosize < fri->size)
+ if (fdi.iosize < fri->size) {
+ /*
+ * Short read. Should only happen at EOF or with
+ * direct io.
+ */
break;
+ }
}
out:
@@ -334,25 +450,57 @@
static int
fuse_write_directbackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
+ struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize,
+ int ioflag, bool pages)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_data *data;
struct fuse_write_in *fwi;
+ struct fuse_write_out *fwo;
struct fuse_dispatcher fdi;
size_t chunksize;
+ void *fwi_data;
+ off_t as_written_offset;
int diff;
int err = 0;
+ bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO;
+ bool wrote_anything = false;
+ uint32_t write_flags;
+ data = fuse_get_mpdata(vp->v_mount);
+
+ /*
+ * Don't set FUSE_WRITE_LOCKOWNER in write_flags. It can't be set
+ * accurately when using POSIX AIO, libfuse doesn't use it, and I'm not
+ * aware of any file systems that do. It was an attempt to add
+ * Linux-style mandatory locking to the FUSE protocol, but mandatory
+ * locking is deprecated even on Linux. See Linux commit
+ * f33321141b273d60cbb3a8f56a5489baad82ba5e .
+ */
+ /*
+ * Set FUSE_WRITE_CACHE whenever we don't know the uid, gid, and/or pid
+ * that originated a write. For example when writing from the
+ * writeback cache. I don't know of a single file system that cares,
+ * but the protocol says we're supposed to do this.
+ */
+ write_flags = !pages && (
+ (ioflag & IO_DIRECT) ||
+ !fsess_opt_datacache(vnode_mount(vp)) ||
+ !fsess_opt_writeback(vnode_mount(vp))) ? 0 : FUSE_WRITE_CACHE;
+
if (uio->uio_resid == 0)
return (0);
+
if (ioflag & IO_APPEND)
- uio_setoffset(uio, fvdat->filesize);
+ uio_setoffset(uio, filesize);
+ if (vn_rlimit_fsize(vp, uio, uio->uio_td))
+ return (EFBIG);
+
fdisp_init(&fdi, 0);
while (uio->uio_resid > 0) {
- chunksize = MIN(uio->uio_resid,
- fuse_get_mpdata(vp->v_mount)->max_write);
+ chunksize = MIN(uio->uio_resid, data->max_write);
fdi.iosize = sizeof(*fwi) + chunksize;
fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred);
@@ -361,79 +509,140 @@
fwi->fh = fufh->fh_id;
fwi->offset = uio->uio_offset;
fwi->size = chunksize;
+ fwi->write_flags = write_flags;
+ if (fuse_libabi_geq(data, 7, 9)) {
+ fwi->flags = fufh_type_2_fflags(fufh->fufh_type);
+ fwi_data = (char *)fdi.indata + sizeof(*fwi);
+ } else {
+ fwi_data = (char *)fdi.indata +
+ FUSE_COMPAT_WRITE_IN_SIZE;
+ }
- if ((err = uiomove((char *)fdi.indata + sizeof(*fwi),
- chunksize, uio)))
+ if ((err = uiomove(fwi_data, chunksize, uio)))
break;
- if ((err = fdisp_wait_answ(&fdi)))
+retry:
+ err = fdisp_wait_answ(&fdi);
+ if (err == ERESTART || err == EINTR || err == EWOULDBLOCK) {
+ /*
+ * Rewind the uio so dofilewrite will know it's
+ * incomplete
+ */
+ uio->uio_resid += fwi->size;
+ uio->uio_offset -= fwi->size;
+ /*
+ * Change ERESTART into EINTR because we can't rewind
+ * uio->uio_iov. Basically, once uiomove(9) has been
+ * called, it's impossible to restart a syscall.
+ */
+ if (err == ERESTART)
+ err = EINTR;
break;
+ } else if (err) {
+ break;
+ } else {
+ wrote_anything = true;
+ }
+ fwo = ((struct fuse_write_out *)fdi.answ);
+
/* Adjust the uio in the case of short writes */
- diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size;
+ diff = fwi->size - fwo->size;
+ as_written_offset = uio->uio_offset - diff;
+
+ if (as_written_offset - diff > filesize)
+ fuse_vnode_setsize(vp, as_written_offset);
+ if (as_written_offset - diff >= filesize)
+ fvdat->flag &= ~FN_SIZECHANGE;
+
if (diff < 0) {
+ printf("WARNING: misbehaving FUSE filesystem "
+ "wrote more data than we provided it\n");
err = EINVAL;
break;
- } else if (diff > 0 && !(ioflag & IO_DIRECT)) {
- /*
- * XXX We really should be directly checking whether
- * the file was opened with FOPEN_DIRECT_IO, not
- * IO_DIRECT. IO_DIRECT can be set in multiple ways.
- */
- SDT_PROBE2(fuse, , io, trace, 1,
- "misbehaving filesystem: short writes are only "
- "allowed with direct_io");
+ } else if (diff > 0) {
+ /* Short write */
+ if (!direct_io) {
+ printf("WARNING: misbehaving FUSE filesystem: "
+ "short writes are only allowed with "
+ "direct_io\n");
+ }
+ if (ioflag & IO_DIRECT) {
+ /* Return early */
+ uio->uio_resid += diff;
+ uio->uio_offset -= diff;
+ break;
+ } else {
+ /* Resend the unwritten portion of data */
+ fdi.iosize = sizeof(*fwi) + diff;
+ /* Refresh fdi without clearing data buffer */
+ fdisp_refresh_vp(&fdi, FUSE_WRITE, vp,
+ uio->uio_td, cred);
+ fwi = fdi.indata;
+ MPASS2(fwi == fdi.indata, "FUSE dispatcher "
+ "reallocated despite no increase in "
+ "size?");
+ void *src = (char*)fwi_data + fwo->size;
+ memmove(fwi_data, src, diff);
+ fwi->fh = fufh->fh_id;
+ fwi->offset = as_written_offset;
+ fwi->size = diff;
+ fwi->write_flags = write_flags;
+ goto retry;
+ }
}
- uio->uio_resid += diff;
- uio->uio_offset -= diff;
-
- if (uio->uio_offset > fvdat->filesize &&
- fuse_data_cache_mode != FUSE_CACHE_UC) {
- fuse_vnode_setsize(vp, uio->uio_offset);
- fvdat->flag &= ~FN_SIZECHANGE;
- }
}
fdisp_destroy(&fdi);
+ if (wrote_anything)
+ fuse_vnode_undirty_cached_timestamps(vp);
+
return (err);
}
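
The retry loop above handles a daemon that acknowledges fewer bytes than it was sent.
On the daemon side (libfuse low-level API) the acknowledged count is simply whatever is
passed to fuse_reply_write(); a minimal, hypothetical handler:

    #define FUSE_USE_VERSION 31
    #include <fuse_lowlevel.h>

    /* Hypothetical write handler backed by some external store. */
    static void
    example_write(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size,
        off_t off, struct fuse_file_info *fi)
    {
        size_t accepted = size;    /* pretend everything was stored */

        (void)ino; (void)buf; (void)off; (void)fi;
        /*
         * Replying with accepted < size is the short-write case above: it
         * is only expected from handles opened with FOPEN_DIRECT_IO.  An
         * IO_DIRECT caller gets the short count back; a cached write has
         * its unwritten tail resent by the kernel.
         */
        fuse_reply_write(req, accepted);
    }
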
-SDT_PROBE_DEFINE6(fuse, , io, write_biobackend_start, "int64_t", "int", "int",
+SDT_PROBE_DEFINE6(fusefs, , io, write_biobackend_start, "int64_t", "int", "int",
"struct uio*", "int", "bool");
-SDT_PROBE_DEFINE2(fuse, , io, write_biobackend_append_race, "long", "int");
+SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_append_race, "long", "int");
+SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_issue, "int", "struct buf*");
static int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
- struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
+ struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct buf *bp;
daddr_t lbn;
+ off_t filesize;
int bcount;
- int n, on, err = 0;
+ int n, on, seqcount, err = 0;
+ bool last_page;
const int biosize = fuse_iosize(vp);
- KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
+ seqcount = ioflag >> IO_SEQSHIFT;
+
+ KASSERT(uio->uio_rw == UIO_WRITE, ("fuse_write_biobackend mode"));
if (vp->v_type != VREG)
return (EIO);
if (uio->uio_offset < 0)
return (EINVAL);
if (uio->uio_resid == 0)
return (0);
+
+ err = fuse_vnode_size(vp, &filesize, cred, curthread);
+ if (err)
+ return err;
+
if (ioflag & IO_APPEND)
- uio_setoffset(uio, fvdat->filesize);
+ uio_setoffset(uio, filesize);
- /*
- * Find all of this file's B_NEEDCOMMIT buffers. If our writes
- * would exceed the local maximum per-file write commit size when
- * combined with those, we must decide whether to flush,
- * go synchronous, or return err. We don't bother checking
- * IO_UNIT -- we just make all writes atomic anyway, as there's
- * no point optimizing for something that really won't ever happen.
- */
+ if (vn_rlimit_fsize(vp, uio, uio->uio_td))
+ return (EFBIG);
+
do {
+ bool direct_append, extending;
+
if (fuse_isdeadfs(vp)) {
err = ENXIO;
break;
@@ -443,66 +652,60 @@
n = MIN((unsigned)(biosize - on), uio->uio_resid);
again:
- /*
- * Handle direct append and file extension cases, calculate
- * unaligned buffer size.
- */
- if (uio->uio_offset == fvdat->filesize && n) {
- /*
- * Get the buffer (in its pre-append state to maintain
- * B_CACHE if it was previously set). Resize the
- * nfsnode after we have locked the buffer to prevent
- * readers from reading garbage.
- */
- bcount = on;
- SDT_PROBE6(fuse, , io, write_biobackend_start,
- lbn, on, n, uio, bcount, true);
- bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
-
+ /* Get or create a buffer for the write */
+ direct_append = uio->uio_offset == filesize && n;
+ if (uio->uio_offset + n < filesize) {
+ extending = false;
+ if ((off_t)(lbn + 1) * biosize < filesize) {
+ /* Not the file's last block */
+ bcount = biosize;
+ } else {
+ /* The file's last block */
+ bcount = filesize - (off_t)lbn * biosize;
+ }
+ } else {
+ extending = true;
+ bcount = on + n;
+ }
+ if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >=
+ howmany(filesize, PAGE_SIZE))
+ last_page = true;
+ else
+ last_page = false;
+ if (direct_append) {
+ /*
+ * Take care to preserve the buffer's B_CACHE state so
+ * as not to cause an unnecessary read.
+ */
+ bp = getblk(vp, lbn, on, PCATCH, 0, 0);
if (bp != NULL) {
- long save;
-
- err = fuse_vnode_setsize(vp,
- uio->uio_offset + n);
- if (err) {
- brelse(bp);
- break;
- }
- save = bp->b_flags & B_CACHE;
- bcount += n;
+ uint32_t save = bp->b_flags & B_CACHE;
allocbuf(bp, bcount);
bp->b_flags |= save;
}
} else {
- /*
- * Obtain the locked cache block first, and then
- * adjust the file's size as appropriate.
- */
- bcount = on + n;
- if ((off_t)lbn * biosize + bcount < fvdat->filesize) {
- if ((off_t)(lbn + 1) * biosize < fvdat->filesize)
- bcount = biosize;
- else
- bcount = fvdat->filesize -
- (off_t)lbn *biosize;
- }
- SDT_PROBE6(fuse, , io, write_biobackend_start,
- lbn, on, n, uio, bcount, false);
bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
- if (bp && uio->uio_offset + n > fvdat->filesize) {
- err = fuse_vnode_setsize(vp,
- uio->uio_offset + n);
- if (err) {
- brelse(bp);
- break;
- }
- }
}
-
if (!bp) {
err = EINTR;
break;
}
+ if (extending) {
+ /*
+ * Extend file _after_ locking buffer so we won't race
+ * with other readers
+ */
+ err = fuse_vnode_setsize(vp, uio->uio_offset + n);
+ filesize = uio->uio_offset + n;
+ fvdat->flag |= FN_SIZECHANGE;
+ if (err) {
+ brelse(bp);
+ break;
+ }
+ }
+
+ SDT_PROBE6(fusefs, , io, write_biobackend_start,
+ lbn, on, n, uio, bcount, direct_append);
/*
* Issue a READ if B_CACHE is not set. In special-append
* mode, B_CACHE is based on the buffer prior to the write
@@ -535,6 +738,21 @@
brelse(bp);
break;
}
+ if (bp->b_resid > 0) {
+ /*
+ * Short read indicates EOF. Update file size
+ * from the server and try again.
+ */
+ SDT_PROBE2(fusefs, , io, trace, 1,
+ "Short read during a RMW");
+ brelse(bp);
+ err = fuse_vnode_size(vp, &filesize, cred,
+ curthread);
+ if (err)
+ break;
+ else
+ goto again;
+ }
}
if (bp->b_wcred == NOCRED)
bp->b_wcred = crhold(cred);
@@ -547,9 +765,8 @@
* If the chopping creates a reverse-indexed or degenerate
* situation with dirtyoff/end, we 0 both of them.
*/
-
if (bp->b_dirtyend > bcount) {
- SDT_PROBE2(fuse, , io, write_biobackend_append_race,
+ SDT_PROBE2(fusefs, , io, write_biobackend_append_race,
(long)bp->b_blkno * biosize,
bp->b_dirtyend - bcount);
bp->b_dirtyend = bcount;
@@ -582,6 +799,7 @@
* reasons: the only way to know if a write is valid
* if its actually written out.)
*/
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 0, bp);
bwrite(bp);
if (bp->b_error == EINTR) {
err = EINTR;
@@ -591,19 +809,12 @@
}
err = uiomove((char *)bp->b_data + on, n, uio);
- /*
- * Since this block is being modified, it must be written
- * again and not just committed. Since write clustering does
- * not work for the stage 1 data write, only the stage 2
- * commit rpc, we have to clear B_CLUSTEROK as well.
- */
- bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
-
if (err) {
bp->b_ioflags |= BIO_ERROR;
bp->b_error = err;
brelse(bp);
break;
+ /* TODO: vfs_bio_clrbuf like ffs_write does? */
}
/*
* Only update dirtyoff/dirtyend if not a degenerate
@@ -619,42 +830,85 @@
}
vfs_bio_set_valid(bp, on, n);
}
- err = bwrite(bp);
+
+ vfs_bio_set_flags(bp, ioflag);
+
+ bp->b_flags |= B_FUSEFS_WRITE_CACHE;
+ if (ioflag & IO_SYNC) {
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 2, bp);
+ if (!(ioflag & IO_VMIO))
+ bp->b_flags &= ~B_FUSEFS_WRITE_CACHE;
+ err = bwrite(bp);
+ } else if (vm_page_count_severe() ||
+ buf_dirty_count_severe() ||
+ (ioflag & IO_ASYNC)) {
+ bp->b_flags |= B_CLUSTEROK;
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 3, bp);
+ bawrite(bp);
+ } else if (on == 0 && n == bcount) {
+ if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
+ bp->b_flags |= B_CLUSTEROK;
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue,
+ 4, bp);
+ cluster_write(vp, bp, filesize, seqcount, 0);
+ } else {
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue,
+ 5, bp);
+ bawrite(bp);
+ }
+ } else if (ioflag & IO_DIRECT) {
+ bp->b_flags |= B_CLUSTEROK;
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 6, bp);
+ bawrite(bp);
+ } else {
+ bp->b_flags &= ~B_CLUSTEROK;
+ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 7, bp);
+ bdwrite(bp);
+ }
if (err)
break;
} while (uio->uio_resid > 0 && n > 0);
- if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0)
- fuse_vnode_savesize(vp, cred);
-
return (err);
}
int
fuse_io_strategy(struct vnode *vp, struct buf *bp)
{
- struct fuse_filehandle *fufh;
struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_filehandle *fufh;
struct ucred *cred;
struct uio *uiop;
struct uio uio;
struct iovec io;
+ off_t filesize;
int error = 0;
+ int fflag;
+ /* We don't know the true pid when we're dealing with the cache */
+ pid_t pid = 0;
const int biosize = fuse_iosize(vp);
MPASS(vp->v_type == VREG || vp->v_type == VDIR);
MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE);
- error = fuse_filehandle_getrw(vp,
- (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
+ fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE;
+ cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred;
+ error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
+ if (bp->b_iocmd == BIO_READ && error == EBADF) {
+ /*
+ * This may be a read-modify-write operation on a cached file
+ * opened O_WRONLY. The FUSE protocol allows this.
+ */
+ error = fuse_filehandle_get(vp, FWRITE, &fufh, cred, pid);
+ }
if (error) {
printf("FUSE: strategy: filehandles are closed\n");
bp->b_ioflags |= BIO_ERROR;
bp->b_error = error;
+ bufdone(bp);
return (error);
}
- cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred;
uiop = &uio;
uiop->uio_iov = &io;
@@ -673,40 +927,57 @@
KASSERT(!(bp->b_flags & B_DONE),
("fuse_io_strategy: bp %p already marked done", bp));
if (bp->b_iocmd == BIO_READ) {
+ ssize_t left;
+
io.iov_len = uiop->uio_resid = bp->b_bcount;
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
- uiop->uio_offset = ((off_t)bp->b_blkno) * biosize;
+ uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize;
error = fuse_read_directbackend(vp, uiop, cred, fufh);
+ /*
+ * Store the amount we failed to read in the buffer's private
+ * field, so callers can truncate the file if necessary.
+ */
- /* XXXCEM: Potentially invalid access to cached_attrs here */
- if ((!error && uiop->uio_resid) ||
- (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO &&
- uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 &&
- uiop->uio_offset >= fvdat->cached_attrs.va_size)) {
- /*
- * If we had a short read with no error, we must have
- * hit a file hole. We should zero-fill the remainder.
- * This can also occur if the server hits the file EOF.
- *
- * Holes used to be able to occur due to pending
- * writes, but that is not possible any longer.
- */
+ if (!error && uiop->uio_resid) {
int nread = bp->b_bcount - uiop->uio_resid;
- int left = uiop->uio_resid;
+ left = uiop->uio_resid;
+ bzero((char *)bp->b_data + nread, left);
- if (error != 0) {
- printf("FUSE: Fix broken io: offset %ju, "
- " resid %zd, file size %ju/%ju\n",
- (uintmax_t)uiop->uio_offset,
- uiop->uio_resid, fvdat->filesize,
- fvdat->cached_attrs.va_size);
- error = 0;
+ if ((fvdat->flag & FN_SIZECHANGE) == 0) {
+ /*
+ * A short read with no error, when not using
+ * direct io, and when no writes are cached,
+ * indicates EOF caused by a server-side
+ * truncation. Clear the attr cache so we'll
+ * pick up the new file size and timestamps.
+ *
+ * We must still bzero the remaining buffer so
+ * uninitialized data doesn't get exposed by a
+ * future truncate that extends the file.
+ *
+ * To prevent lock order problems, we must
+ * truncate the file upstack, not here.
+ */
+ SDT_PROBE2(fusefs, , io, trace, 1,
+ "Short read of a clean file");
+ fuse_vnode_clear_attr_cache(vp);
+ } else {
+ /*
+ * If dirty writes _are_ cached beyond EOF,
+ * that indicates a newly created hole that the
+ * server doesn't know about. Those don't pose
+ * any problem.
+ * XXX: we don't currently track whether dirty
+ * writes are cached beyond EOF, before EOF, or
+ * both.
+ */
+ SDT_PROBE2(fusefs, , io, trace, 1,
+ "Short read of a dirty file");
+ uiop->uio_resid = 0;
}
- if (left > 0)
- bzero((char *)bp->b_data + nread, left);
- uiop->uio_resid = 0;
+
}
if (error) {
bp->b_ioflags |= BIO_ERROR;
@@ -714,33 +985,33 @@
}
} else {
/*
- * If we only need to commit, try to commit
- */
- if (bp->b_flags & B_NEEDCOMMIT) {
- SDT_PROBE2(fuse, , io, trace, 1,
- "write: B_NEEDCOMMIT flags set");
- }
- /*
* Setup for actual write
*/
- if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend >
- fvdat->filesize)
- bp->b_dirtyend = fvdat->filesize -
- (off_t)bp->b_blkno * biosize;
+ error = fuse_vnode_size(vp, &filesize, cred, curthread);
+ if (error) {
+ bp->b_ioflags |= BIO_ERROR;
+ bp->b_error = error;
+ bufdone(bp);
+ return (error);
+ }
+ if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize)
+ bp->b_dirtyend = filesize -
+ (off_t)bp->b_lblkno * biosize;
+
if (bp->b_dirtyend > bp->b_dirtyoff) {
io.iov_len = uiop->uio_resid = bp->b_dirtyend
- bp->b_dirtyoff;
- uiop->uio_offset = (off_t)bp->b_blkno * biosize
+ uiop->uio_offset = (off_t)bp->b_lblkno * biosize
+ bp->b_dirtyoff;
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
uiop->uio_rw = UIO_WRITE;
- error = fuse_write_directbackend(vp, uiop, cred, fufh, 0);
+ bool pages = bp->b_flags & B_FUSEFS_WRITE_CACHE;
+ error = fuse_write_directbackend(vp, uiop, cred, fufh,
+ filesize, 0, pages);
- if (error == EINTR || error == ETIMEDOUT
- || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
-
+ if (error == EINTR || error == ETIMEDOUT) {
bp->b_flags &= ~(B_INVAL | B_NOCACHE);
if ((bp->b_flags & B_PAGING) == 0) {
bdirty(bp);
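
fuse_io_dispatch() above forces IO_SYNC on buffered writes unless fsess_opt_writeback()
is true, which requires the daemon to negotiate FUSE_WRITEBACK_CACHE during FUSE_INIT
(protocol 7.23 or later; older daemons fall back to the fuse_data_cache_mode default).
A hedged sketch of opting in, assuming the libfuse 3 low-level API:

    #define FUSE_USE_VERSION 31
    #include <fuse_lowlevel.h>

    /*
     * Hypothetical init handler.  Requesting FUSE_CAP_WRITEBACK_CACHE makes
     * libfuse set FUSE_WRITEBACK_CACHE in its FUSE_INIT reply, which
     * fuse_internal_init_callback() maps to FUSE_CACHE_WB, so buffered
     * writes are no longer forced to be write-through.
     */
    static void
    example_init(void *userdata, struct fuse_conn_info *conn)
    {
        (void)userdata;
        if (conn->capable & FUSE_CAP_WRITEBACK_CACHE)
            conn->want |= FUSE_CAP_WRITEBACK_CACHE;
    }
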
Index: sys/fs/fuse/fuse_ipc.h
===================================================================
--- sys/fs/fuse/fuse_ipc.h
+++ sys/fs/fuse/fuse_ipc.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -63,6 +68,12 @@
#include <sys/param.h>
#include <sys/refcount.h>
+enum fuse_data_cache_mode {
+ FUSE_CACHE_UC,
+ FUSE_CACHE_WT,
+ FUSE_CACHE_WB,
+};
+
struct fuse_iov {
void *base;
size_t len;
@@ -103,6 +114,12 @@
struct fuse_data *tk_data;
int tk_flag;
u_int tk_refcount;
+ /*
+ * If this ticket's operation has been interrupted, this will hold the
+ * unique value of the FUSE_INTERRUPT operation. Otherwise, it will be
+ * 0.
+ */
+ uint64_t irq_unique;
/* fields for initiating an upgoing message */
struct fuse_iov tk_ms_fiov;
@@ -147,16 +164,20 @@
ftick->tk_flag |= FT_ANSW;
}
+static inline struct fuse_in_header*
+fticket_in_header(struct fuse_ticket *ftick)
+{
+ return (struct fuse_in_header *)(ftick->tk_ms_fiov.base);
+}
+
static inline enum fuse_opcode
fticket_opcode(struct fuse_ticket *ftick)
{
- return (((struct fuse_in_header *)(ftick->tk_ms_fiov.base))->opcode);
+ return fticket_in_header(ftick)->opcode;
}
int fticket_pull(struct fuse_ticket *ftick, struct uio *uio);
-enum mountpri { FM_NOMOUNTED, FM_PRIMARY, FM_SECONDARY };
-
/*
* The data representing a FUSE session.
*/
@@ -170,10 +191,16 @@
struct mtx ms_mtx;
STAILQ_HEAD(, fuse_ticket) ms_head;
+ int ms_count;
struct mtx aw_mtx;
TAILQ_HEAD(, fuse_ticket) aw_head;
+ /*
+ * Holds the next value of the FUSE operation unique value.
+ * Also, serves as a wakeup channel to prevent any operations from
+ * being created before INIT completes.
+ */
u_long ticketer;
struct sx rename_lock;
@@ -181,6 +208,7 @@
uint32_t fuse_libabi_major;
uint32_t fuse_libabi_minor;
+ uint32_t max_readahead_blocks;
uint32_t max_write;
uint32_t max_read;
uint32_t subtype;
@@ -189,34 +217,27 @@
struct selinfo ks_rsel;
int daemon_timeout;
+ unsigned time_gran;
uint64_t notimpl;
+ uint64_t mnt_flag;
+ enum fuse_data_cache_mode cache_mode;
};
#define FSESS_DEAD 0x0001 /* session is to be closed */
-#define FSESS_UNUSED0 0x0002 /* unused */
#define FSESS_INITED 0x0004 /* session has been inited */
#define FSESS_DAEMON_CAN_SPY 0x0010 /* let non-owners access this fs */
/* (and being observed by the daemon) */
#define FSESS_PUSH_SYMLINKS_IN 0x0020 /* prefix absolute symlinks with mp */
#define FSESS_DEFAULT_PERMISSIONS 0x0040 /* kernel does permission checking */
-#define FSESS_NO_ATTRCACHE 0x0080 /* no attribute caching */
-#define FSESS_NO_READAHEAD 0x0100 /* no readaheads */
-#define FSESS_NO_DATACACHE 0x0200 /* disable buffer cache */
-#define FSESS_NO_NAMECACHE 0x0400 /* disable name cache */
-#define FSESS_NO_MMAP 0x0800 /* disable mmap */
-#define FSESS_BROKENIO 0x1000 /* fix broken io */
+#define FSESS_ASYNC_READ 0x1000 /* allow multiple reads of some file */
+#define FSESS_POSIX_LOCKS 0x2000 /* daemon supports POSIX locks */
+#define FSESS_EXPORT_SUPPORT 0x10000 /* daemon supports NFS-style lookups */
+#define FSESS_INTR 0x20000 /* interruptible mounts */
+#define FSESS_MNTOPTS_MASK ( \
+ FSESS_DAEMON_CAN_SPY | FSESS_PUSH_SYMLINKS_IN | \
+ FSESS_DEFAULT_PERMISSIONS | FSESS_INTR)
-enum fuse_data_cache_mode {
- FUSE_CACHE_UC,
- FUSE_CACHE_WT,
- FUSE_CACHE_WB,
-};
-
extern int fuse_data_cache_mode;
-extern int fuse_data_cache_invalidate;
-extern int fuse_mmap_enable;
-extern int fuse_sync_resize;
-extern int fuse_fix_broken_io;
static inline struct fuse_data *
fuse_get_mpdata(struct mount *mp)
@@ -245,36 +266,43 @@
{
struct fuse_data *data = fuse_get_mpdata(mp);
- return (fuse_data_cache_mode != FUSE_CACHE_UC &&
- (data->dataflags & FSESS_NO_DATACACHE) == 0);
+ return (data->cache_mode != FUSE_CACHE_UC);
}
static inline bool
fsess_opt_mmap(struct mount *mp)
{
- struct fuse_data *data = fuse_get_mpdata(mp);
-
- if (!fuse_mmap_enable || fuse_data_cache_mode == FUSE_CACHE_UC)
- return (false);
- return ((data->dataflags & (FSESS_NO_DATACACHE | FSESS_NO_MMAP)) == 0);
+ return (fsess_opt_datacache(mp));
}
static inline bool
-fsess_opt_brokenio(struct mount *mp)
+fsess_opt_writeback(struct mount *mp)
{
struct fuse_data *data = fuse_get_mpdata(mp);
- return (fuse_fix_broken_io || (data->dataflags & FSESS_BROKENIO));
+ return (data->cache_mode == FUSE_CACHE_WB);
}
+/* Insert a new upgoing message */
static inline void
fuse_ms_push(struct fuse_ticket *ftick)
{
mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED);
refcount_acquire(&ftick->tk_refcount);
STAILQ_INSERT_TAIL(&ftick->tk_data->ms_head, ftick, tk_ms_link);
+ ftick->tk_data->ms_count++;
}
+/* Insert a new upgoing message to the front of the queue */
+static inline void
+fuse_ms_push_head(struct fuse_ticket *ftick)
+{
+ mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED);
+ refcount_acquire(&ftick->tk_refcount);
+ STAILQ_INSERT_HEAD(&ftick->tk_data->ms_head, ftick, tk_ms_link);
+ ftick->tk_data->ms_count++;
+}
+
static inline struct fuse_ticket *
fuse_ms_pop(struct fuse_data *data)
{
@@ -284,7 +312,9 @@
if ((ftick = STAILQ_FIRST(&data->ms_head))) {
STAILQ_REMOVE_HEAD(&data->ms_head, tk_ms_link);
+ data->ms_count--;
#ifdef INVARIANTS
+ MPASS(data->ms_count >= 0);
ftick->tk_ms_link.stqe_next = NULL;
#endif
}
@@ -327,7 +357,7 @@
struct fuse_ticket *fuse_ticket_fetch(struct fuse_data *data);
int fuse_ticket_drop(struct fuse_ticket *ftick);
void fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t *handler);
-void fuse_insert_message(struct fuse_ticket *ftick);
+void fuse_insert_message(struct fuse_ticket *ftick, bool irq);
static inline bool
fuse_libabi_geq(struct fuse_data *data, uint32_t abi_maj, uint32_t abi_min)
@@ -374,13 +404,15 @@
#endif
}
+void fdisp_refresh(struct fuse_dispatcher *fdip);
+
void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred);
-void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
- struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred);
-
void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+ struct vnode *vp, struct thread *td, struct ucred *cred);
+
+void fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct vnode *vp, struct thread *td, struct ucred *cred);
int fdisp_wait_answ(struct fuse_dispatcher *fdip);
Index: sys/fs/fuse/fuse_ipc.c
===================================================================
--- sys/fs/fuse/fuse_ipc.c
+++ sys/fs/fuse/fuse_ipc.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -61,6 +66,7 @@
#include <sys/param.h>
#include <sys/module.h>
#include <sys/systm.h>
+#include <sys/counter.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/conf.h>
@@ -84,14 +90,17 @@
#include "fuse_ipc.h"
#include "fuse_internal.h"
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , ipc, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , ipc, trace, "int", "char*");
+static void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+ struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred);
+static void fuse_interrupt_send(struct fuse_ticket *otick, int err);
static struct fuse_ticket *fticket_alloc(struct fuse_data *data);
static void fticket_refresh(struct fuse_ticket *ftick);
static void fticket_destroy(struct fuse_ticket *ftick);
@@ -104,13 +113,10 @@
static fuse_handler_t fuse_standard_handler;
-SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables");
-SYSCTL_STRING(_vfs_fusefs, OID_AUTO, version, CTLFLAG_RD,
- FUSE_FREEBSD_VERSION, 0, "fuse-freebsd version");
-static int fuse_ticket_count = 0;
+static counter_u64_t fuse_ticket_count;
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, ticket_count, CTLFLAG_RD,
+ &fuse_ticket_count, "Number of allocated tickets");
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, ticket_count, CTLFLAG_RW,
- &fuse_ticket_count, 0, "number of allocated tickets");
static long fuse_iov_permanent_bufsize = 1 << 19;
SYSCTL_LONG(_vfs_fusefs, OID_AUTO, iov_permanent_bufsize, CTLFLAG_RW,
@@ -125,25 +131,131 @@
MALLOC_DEFINE(M_FUSEMSG, "fuse_msgbuf", "fuse message buffer");
static uma_zone_t ticket_zone;
-static void
-fuse_block_sigs(sigset_t *oldset)
+/*
+ * TODO: figure out how to time out INTERRUPT requests, because the daemon may
+ * legally never respond
+ */
+static int
+fuse_interrupt_callback(struct fuse_ticket *tick, struct uio *uio)
{
- sigset_t newset;
+ struct fuse_ticket *otick, *x_tick;
+ struct fuse_interrupt_in *fii;
+ struct fuse_data *data = tick->tk_data;
+ bool found = false;
- SIGFILLSET(newset);
- SIGDELSET(newset, SIGKILL);
- if (kern_sigprocmask(curthread, SIG_BLOCK, &newset, oldset, 0))
- panic("%s: Invalid operation for kern_sigprocmask()",
- __func__);
+ fii = (struct fuse_interrupt_in*)((char*)tick->tk_ms_fiov.base +
+ sizeof(struct fuse_in_header));
+
+ fuse_lck_mtx_lock(data->aw_mtx);
+ TAILQ_FOREACH_SAFE(otick, &data->aw_head, tk_aw_link, x_tick) {
+ if (otick->tk_unique == fii->unique) {
+ found = true;
+ break;
+ }
+ }
+ fuse_lck_mtx_unlock(data->aw_mtx);
+
+ if (!found) {
+ /* Original is already complete. Just return */
+ return 0;
+ }
+
+ /* Clear the original ticket's interrupt association */
+ otick->irq_unique = 0;
+
+ if (tick->tk_aw_ohead.error == ENOSYS) {
+ fsess_set_notimpl(data->mp, FUSE_INTERRUPT);
+ return 0;
+ } else if (tick->tk_aw_ohead.error == EAGAIN) {
+ /*
+ * There are two reasons we might get this:
+ * 1) the daemon received the INTERRUPT request before the
+ * original, or
+ * 2) the daemon received the INTERRUPT request after it
+ * completed the original request.
+ * In the first case we should re-send the INTERRUPT. In the
+ * second, we should ignore it.
+ */
+ /* Resend */
+ fuse_interrupt_send(otick, EINTR);
+ return 0;
+ } else {
+ /* Illegal FUSE_INTERRUPT response */
+ return EINVAL;
+ }
}
-static void
-fuse_restore_sigs(sigset_t *oldset)
+/* Interrupt the operation otick. Return err as its error code */
+void
+fuse_interrupt_send(struct fuse_ticket *otick, int err)
{
+ struct fuse_dispatcher fdi;
+ struct fuse_interrupt_in *fii;
+ struct fuse_in_header *ftick_hdr;
+ struct fuse_data *data = otick->tk_data;
+ struct fuse_ticket *tick, *xtick;
+ struct ucred reused_creds;
+ gid_t reused_groups[1];
- if (kern_sigprocmask(curthread, SIG_SETMASK, oldset, NULL, 0))
- panic("%s: Invalid operation for kern_sigprocmask()",
- __func__);
+ if (otick->irq_unique == 0) {
+ /*
+ * If the daemon hasn't yet received otick, then we can answer
+ * it ourselves and return.
+ */
+ fuse_lck_mtx_lock(data->ms_mtx);
+ STAILQ_FOREACH_SAFE(tick, &otick->tk_data->ms_head, tk_ms_link,
+ xtick) {
+ if (tick == otick) {
+ STAILQ_REMOVE(&otick->tk_data->ms_head, tick,
+ fuse_ticket, tk_ms_link);
+ otick->tk_data->ms_count--;
+ otick->tk_ms_link.stqe_next = NULL;
+ fuse_lck_mtx_unlock(data->ms_mtx);
+
+ fuse_lck_mtx_lock(otick->tk_aw_mtx);
+ if (!fticket_answered(otick)) {
+ fticket_set_answered(otick);
+ otick->tk_aw_errno = err;
+ wakeup(otick);
+ }
+ fuse_lck_mtx_unlock(otick->tk_aw_mtx);
+
+ fuse_ticket_drop(tick);
+ return;
+ }
+ }
+ fuse_lck_mtx_unlock(data->ms_mtx);
+
+ /*
+ * If the fuse daemon doesn't support interrupts, then there's
+ * nothing more that we can do
+ */
+ if (!fsess_isimpl(data->mp, FUSE_INTERRUPT))
+ return;
+
+ /*
+ * If the fuse daemon has already received otick, then we must
+ * send FUSE_INTERRUPT.
+ */
+ ftick_hdr = fticket_in_header(otick);
+ reused_creds.cr_uid = ftick_hdr->uid;
+ reused_groups[0] = ftick_hdr->gid;
+ reused_creds.cr_groups = reused_groups;
+ fdisp_init(&fdi, sizeof(*fii));
+ fdisp_make_pid(&fdi, FUSE_INTERRUPT, data, ftick_hdr->nodeid,
+ ftick_hdr->pid, &reused_creds);
+
+ fii = fdi.indata;
+ fii->unique = otick->tk_unique;
+ fuse_insert_callback(fdi.tick, fuse_interrupt_callback);
+
+ otick->irq_unique = fdi.tick->tk_unique;
+ /* Interrupt ops should be delivered ASAP */
+ fuse_insert_message(fdi.tick, true);
+ fdisp_destroy(&fdi);
+ } else {
+ /* This ticket has already been interrupted */
+ }
}
void
@@ -181,14 +293,19 @@
}
fiov->allocated_size = FU_AT_LEAST(size);
fiov->credit = fuse_iov_credit;
+ /* Clear data buffer after reallocation */
+ bzero(fiov->base, size);
+ } else if (size > fiov->len) {
+ /* Clear newly extended portion of data buffer */
+ bzero((char*)fiov->base + fiov->len, size - fiov->len);
}
fiov->len = size;
}
+/* Resize the fiov if needed, and clear its buffer */
void
fiov_refresh(struct fuse_iov *fiov)
{
- bzero(fiov->base, fiov->len);
fiov_adjust(fiov, 0);
}
@@ -211,8 +328,10 @@
if (ftick->tk_unique == 0)
ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1);
+ ftick->irq_unique = 0;
+
refcount_init(&ftick->tk_refcount, 1);
- atomic_add_acq_int(&fuse_ticket_count, 1);
+ counter_u64_add(fuse_ticket_count, 1);
return 0;
}
@@ -227,7 +346,7 @@
FUSE_ASSERT_MS_DONE(ftick);
FUSE_ASSERT_AW_DONE(ftick);
- atomic_subtract_acq_int(&fuse_ticket_count, 1);
+ counter_u64_add(fuse_ticket_count, -1);
}
static int
@@ -269,7 +388,7 @@
return uma_zfree(ticket_zone, ftick);
}
-static inline
+static inline
void
fticket_refresh(struct fuse_ticket *ftick)
{
@@ -292,30 +411,65 @@
ftick->tk_flag = 0;
}
+/* Prepare the ticket to be reused, but don't clear its data buffers */
+static inline void
+fticket_reset(struct fuse_ticket *ftick)
+{
+ FUSE_ASSERT_MS_DONE(ftick);
+ FUSE_ASSERT_AW_DONE(ftick);
+
+ ftick->tk_ms_bufdata = NULL;
+ ftick->tk_ms_bufsize = 0;
+ ftick->tk_ms_type = FT_M_FIOV;
+
+ bzero(&ftick->tk_aw_ohead, sizeof(struct fuse_out_header));
+
+ ftick->tk_aw_errno = 0;
+ ftick->tk_aw_bufdata = NULL;
+ ftick->tk_aw_bufsize = 0;
+ ftick->tk_aw_type = FT_A_FIOV;
+
+ ftick->tk_flag = 0;
+}
+
static int
fticket_wait_answer(struct fuse_ticket *ftick)
{
- sigset_t tset;
- int err = 0;
- struct fuse_data *data;
+ struct thread *td = curthread;
+ sigset_t blockedset, oldset;
+ int err = 0, stops_deferred;
+ struct fuse_data *data = ftick->tk_data;
+ bool interrupted = false;
+ if (fsess_isimpl(ftick->tk_data->mp, FUSE_INTERRUPT) &&
+ data->dataflags & FSESS_INTR) {
+ SIGEMPTYSET(blockedset);
+ } else {
+ /* Block all signals except (implicitly) SIGKILL */
+ SIGFILLSET(blockedset);
+ }
+ stops_deferred = sigdeferstop(SIGDEFERSTOP_SILENT);
+ kern_sigprocmask(td, SIG_BLOCK, NULL, &oldset, 0);
+
fuse_lck_mtx_lock(ftick->tk_aw_mtx);
+retry:
if (fticket_answered(ftick)) {
goto out;
}
- data = ftick->tk_data;
if (fdata_get_dead(data)) {
err = ENOTCONN;
fticket_set_answered(ftick);
goto out;
}
- fuse_block_sigs(&tset);
+ kern_sigprocmask(td, SIG_BLOCK, &blockedset, NULL, 0);
err = msleep(ftick, &ftick->tk_aw_mtx, PCATCH, "fu_ans",
data->daemon_timeout * hz);
- fuse_restore_sigs(&tset);
- if (err == EAGAIN) { /* same as EWOULDBLOCK */
+ kern_sigprocmask(td, SIG_SETMASK, &oldset, NULL, 0);
+ if (err == EWOULDBLOCK) {
+ SDT_PROBE2(fusefs, , ipc, trace, 3,
+ "fticket_wait_answer: EWOULDBLOCK");
#ifdef XXXIP /* die conditionally */
if (!fdata_get_dead(data)) {
fdata_set_dead(data);
@@ -323,14 +477,64 @@
#endif
err = ETIMEDOUT;
fticket_set_answered(ftick);
+ } else if ((err == EINTR || err == ERESTART)) {
+ /*
+ * Whether we get EINTR or ERESTART depends on whether
+ * SA_RESTART was set by sigaction(2).
+ *
+ * Try to interrupt the operation and wait for an EINTR response
+ * to the original operation. If the file system does not
+ * support FUSE_INTERRUPT, then we'll just wait for it to
+ * complete like normal. If it does support FUSE_INTERRUPT,
+ * then it will either respond EINTR to the original operation,
+ * or EAGAIN to the interrupt.
+ */
+ sigset_t tmpset;
+
+ SDT_PROBE2(fusefs, , ipc, trace, 4,
+ "fticket_wait_answer: interrupt");
+ fuse_lck_mtx_unlock(ftick->tk_aw_mtx);
+ fuse_interrupt_send(ftick, err);
+
+ PROC_LOCK(td->td_proc);
+ mtx_lock(&td->td_proc->p_sigacts->ps_mtx);
+ tmpset = td->td_proc->p_siglist;
+ SIGSETOR(tmpset, td->td_siglist);
+ mtx_unlock(&td->td_proc->p_sigacts->ps_mtx);
+ PROC_UNLOCK(td->td_proc);
+
+ fuse_lck_mtx_lock(ftick->tk_aw_mtx);
+ if (!interrupted && !SIGISMEMBER(tmpset, SIGKILL)) {
+ /*
+ * Block all signals while we wait for an interrupt
+ * response. The protocol doesn't discriminate between
+ * different signals.
+ */
+ SIGFILLSET(blockedset);
+ interrupted = true;
+ goto retry;
+ } else {
+ /*
+ * Return immediately for fatal signals, or if this is
+ * the second interruption. We should only be
+ * interrupted twice if the thread is stopped, for
+ * example during sigexit.
+ */
+ }
+ } else if (err) {
+ SDT_PROBE2(fusefs, , ipc, trace, 6,
+ "fticket_wait_answer: other error");
+ } else {
+ SDT_PROBE2(fusefs, , ipc, trace, 7, "fticket_wait_answer: OK");
}
out:
if (!(err || fticket_answered(ftick))) {
- SDT_PROBE2(fuse, , ipc, trace, 1,
+ SDT_PROBE2(fusefs, , ipc, trace, 1,
"FUSE: requester was woken up but still no answer");
err = ENXIO;
}
fuse_lck_mtx_unlock(ftick->tk_aw_mtx);
+ sigallowstop(stops_deferred);
return err;
}
@@ -386,6 +590,8 @@
data->fdev = fdev;
mtx_init(&data->ms_mtx, "fuse message list mutex", NULL, MTX_DEF);
STAILQ_INIT(&data->ms_head);
+ data->ms_count = 0;
+ knlist_init_mtx(&data->ks_rsel.si_note, &data->ms_mtx);
mtx_init(&data->aw_mtx, "fuse answer list mutex", NULL, MTX_DEF);
TAILQ_INIT(&data->aw_head);
data->daemoncred = crhold(cred);
@@ -405,11 +611,12 @@
return;
/* Driving off stage all that stuff thrown at device... */
- mtx_destroy(&data->ms_mtx);
- mtx_destroy(&data->aw_mtx);
sx_destroy(&data->rename_lock);
-
crfree(data->daemoncred);
+ mtx_destroy(&data->aw_mtx);
+ knlist_delete(&data->ks_rsel.si_note, curthread, 0);
+ knlist_destroy(&data->ks_rsel.si_note);
+ mtx_destroy(&data->ms_mtx);
free(data, M_FUSEMSG);
}
@@ -478,8 +685,14 @@
fuse_lck_mtx_unlock(ftick->tk_data->aw_mtx);
}
+/*
+ * Insert a new upgoing ticket into the message queue
+ *
+ * If urgent is true, insert at the front of the queue. Otherwise, insert in
+ * FIFO order.
+ */
void
-fuse_insert_message(struct fuse_ticket *ftick)
+fuse_insert_message(struct fuse_ticket *ftick, bool urgent)
{
if (ftick->tk_flag & FT_DIRTY) {
panic("FUSE: ticket reused without being refreshed");
@@ -490,9 +703,13 @@
return;
}
fuse_lck_mtx_lock(ftick->tk_data->ms_mtx);
- fuse_ms_push(ftick);
+ if (urgent)
+ fuse_ms_push_head(ftick);
+ else
+ fuse_ms_push(ftick);
wakeup_one(ftick->tk_data);
selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1);
+ KNOTE_LOCKED(&ftick->tk_data->ks_rsel.si_note, 0);
fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx);
}
@@ -505,8 +722,21 @@
opcode = fticket_opcode(ftick);
switch (opcode) {
+ case FUSE_BMAP:
+ err = (blen == sizeof(struct fuse_bmap_out)) ? 0 : EINVAL;
+ break;
+
+ case FUSE_LINK:
case FUSE_LOOKUP:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
+ case FUSE_MKDIR:
+ case FUSE_MKNOD:
+ case FUSE_SYMLINK:
+ if (fuse_libabi_geq(ftick->tk_data, 7, 9)) {
+ err = (blen == sizeof(struct fuse_entry_out)) ?
+ 0 : EINVAL;
+ } else {
+ err = (blen == FUSE_COMPAT_ENTRY_OUT_SIZE) ? 0 : EINVAL;
+ }
break;
case FUSE_FORGET:
@@ -514,29 +744,19 @@
break;
case FUSE_GETATTR:
- err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL;
- break;
-
case FUSE_SETATTR:
- err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL;
+ if (fuse_libabi_geq(ftick->tk_data, 7, 9)) {
+ err = (blen == sizeof(struct fuse_attr_out)) ?
+ 0 : EINVAL;
+ } else {
+ err = (blen == FUSE_COMPAT_ATTR_OUT_SIZE) ? 0 : EINVAL;
+ }
break;
case FUSE_READLINK:
err = (PAGE_SIZE >= blen) ? 0 : EINVAL;
break;
- case FUSE_SYMLINK:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
- break;
-
- case FUSE_MKNOD:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
- break;
-
- case FUSE_MKDIR:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
- break;
-
case FUSE_UNLINK:
err = (blen == 0) ? 0 : EINVAL;
break;
@@ -549,10 +769,6 @@
err = (blen == 0) ? 0 : EINVAL;
break;
- case FUSE_LINK:
- err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL;
- break;
-
case FUSE_OPEN:
err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL;
break;
@@ -607,7 +823,9 @@
break;
case FUSE_INIT:
- if (blen == sizeof(struct fuse_init_out) || blen == 8) {
+ if (blen == sizeof(struct fuse_init_out) ||
+ blen == FUSE_COMPAT_INIT_OUT_SIZE ||
+ blen == FUSE_COMPAT_22_INIT_OUT_SIZE) {
err = 0;
} else {
err = EINVAL;
@@ -634,15 +852,15 @@
break;
case FUSE_GETLK:
- panic("FUSE: no response body format check for FUSE_GETLK");
+ err = (blen == sizeof(struct fuse_lk_out)) ? 0 : EINVAL;
break;
case FUSE_SETLK:
- panic("FUSE: no response body format check for FUSE_SETLK");
+ err = (blen == 0) ? 0 : EINVAL;
break;
case FUSE_SETLKW:
- panic("FUSE: no response body format check for FUSE_SETLKW");
+ err = (blen == 0) ? 0 : EINVAL;
break;
case FUSE_ACCESS:
@@ -650,8 +868,13 @@
break;
case FUSE_CREATE:
- err = (blen == sizeof(struct fuse_entry_out) +
- sizeof(struct fuse_open_out)) ? 0 : EINVAL;
+ if (fuse_libabi_geq(ftick->tk_data, 7, 9)) {
+ err = (blen == sizeof(struct fuse_entry_out) +
+ sizeof(struct fuse_open_out)) ? 0 : EINVAL;
+ } else {
+ err = (blen == FUSE_COMPAT_ENTRY_OUT_SIZE +
+ sizeof(struct fuse_open_out)) ? 0 : EINVAL;
+ }
break;
case FUSE_DESTROY:
@@ -677,7 +900,7 @@
ihead->pid = pid;
ihead->uid = cred->cr_uid;
- ihead->gid = cred->cr_rgid;
+ ihead->gid = cred->cr_groups[0];
}
/*
@@ -705,18 +928,38 @@
return err;
}
-void
-fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+/*
+ * Reinitialize a dispatcher from a pid and node id, without resizing or
+ * clearing its data buffers
+ */
+static void
+fdisp_refresh_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred)
{
- struct fuse_data *data = fuse_get_mpdata(mp);
+ MPASS(fdip->tick);
+ MPASS2(sizeof(fdip->finh) + fdip->iosize <= fdip->tick->tk_ms_fiov.len,
+ "Must use fdisp_make_pid to increase the size of the fiov");
+ fticket_reset(fdip->tick);
+ FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh,
+ fdip->indata, fdip->iosize);
+
+ fuse_setup_ihead(fdip->finh, fdip->tick, nid, op, fdip->iosize, pid,
+ cred);
+}
+
+/* Initialize a dispatcher from a pid and node id */
+static void
+fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+ struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred)
+{
if (fdip->tick) {
fticket_refresh(fdip->tick);
} else {
fdip->tick = fuse_ticket_fetch(data);
}
+ /* FUSE_DIMALLOC will bzero the fiovs when it enlarges them */
FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh,
fdip->indata, fdip->iosize);
@@ -727,22 +970,42 @@
fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp,
uint64_t nid, struct thread *td, struct ucred *cred)
{
+ struct fuse_data *data = fuse_get_mpdata(mp);
RECTIFY_TDCR(td, cred);
- return fdisp_make_pid(fdip, op, mp, nid, td->td_proc->p_pid, cred);
+ return fdisp_make_pid(fdip, op, data, nid, td->td_proc->p_pid, cred);
}
void
fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
struct vnode *vp, struct thread *td, struct ucred *cred)
{
+ struct mount *mp = vnode_mount(vp);
+ struct fuse_data *data = fuse_get_mpdata(mp);
+
RECTIFY_TDCR(td, cred);
- return fdisp_make_pid(fdip, op, vnode_mount(vp), VTOI(vp),
+ return fdisp_make_pid(fdip, op, data, VTOI(vp),
td->td_proc->p_pid, cred);
}
-SDT_PROBE_DEFINE2(fuse, , ipc, fdisp_wait_answ_error, "char*", "int");
+/* Refresh a fuse_dispatcher so it can be reused, but don't zero its data */
+void
+fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op,
+ struct vnode *vp, struct thread *td, struct ucred *cred)
+{
+ RECTIFY_TDCR(td, cred);
+ return fdisp_refresh_pid(fdip, op, vnode_mount(vp), VTOI(vp),
+ td->td_proc->p_pid, cred);
+}
+void
+fdisp_refresh(struct fuse_dispatcher *fdip)
+{
+ fticket_refresh(fdip->tick);
+}
+
+SDT_PROBE_DEFINE2(fusefs, , ipc, fdisp_wait_answ_error, "char*", "int");
+
int
fdisp_wait_answ(struct fuse_dispatcher *fdip)
{
@@ -750,7 +1013,7 @@
fdip->answ_stat = 0;
fuse_insert_callback(fdip->tick, fuse_standard_handler);
- fuse_insert_message(fdip->tick);
+ fuse_insert_message(fdip->tick, false);
if ((err = fticket_wait_answer(fdip->tick))) {
fuse_lck_mtx_lock(fdip->tick->tk_aw_mtx);
@@ -761,7 +1024,7 @@
* the standard handler has completed his job.
* So we drop the ticket and exit as usual.
*/
- SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+ SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error,
"IPC: interrupted, already answered", err);
fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx);
goto out;
@@ -771,7 +1034,7 @@
* Then by setting the answered flag we get *him*
* to drop the ticket.
*/
- SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+ SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error,
"IPC: interrupted, setting to answered", err);
fticket_set_answered(fdip->tick);
fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx);
@@ -779,14 +1042,22 @@
}
}
- if (fdip->tick->tk_aw_errno) {
- SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+ if (fdip->tick->tk_aw_errno == ENOTCONN) {
+ /* The daemon died while we were waiting for a response */
+ err = ENOTCONN;
+ goto out;
+ } else if (fdip->tick->tk_aw_errno) {
+ /*
+ * There was some sort of communication error with the daemon
+ * that the client wouldn't understand.
+ */
+ SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error,
"IPC: explicit EIO-ing", fdip->tick->tk_aw_errno);
err = EIO;
goto out;
}
if ((err = fdip->tick->tk_aw_ohead.error)) {
- SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error,
+ SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error,
"IPC: setting status", fdip->tick->tk_aw_ohead.error);
/*
* This means a "proper" fuse syscall error.
@@ -815,10 +1086,13 @@
ticket_zone = uma_zcreate("fuse_ticket", sizeof(struct fuse_ticket),
fticket_ctor, fticket_dtor, fticket_init, fticket_fini,
UMA_ALIGN_PTR, 0);
+ fuse_ticket_count = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_ticket_count);
}
void
fuse_ipc_destroy(void)
{
+ counter_u64_free(fuse_ticket_count);
uma_zdestroy(ticket_zone);
}
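
fuse_interrupt_callback() above sorts the daemon's reply to a FUSE_INTERRUPT into three cases: ignore it (the original request already completed, or the daemon returned ENOSYS and so does not implement interrupts), resend it (EAGAIN may mean the daemon saw the interrupt before the original request), or treat it as a protocol error. A compact userspace sketch of that decision; the enum and function names are hypothetical, and the real code operates on tickets rather than bare error numbers:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical actions matching the branches of fuse_interrupt_callback(). */
enum irq_action {
	IRQ_IGNORE,	/* original already answered, or daemon lacks support */
	IRQ_RESEND,	/* daemon saw the interrupt too early or too late */
	IRQ_PROTO_ERR,	/* illegal FUSE_INTERRUPT response */
};

static enum irq_action
classify_interrupt_reply(bool original_pending, int daemon_error)
{
	if (!original_pending)
		return (IRQ_IGNORE);		/* nothing left to interrupt */
	switch (daemon_error) {
	case ENOSYS:
		/* The file system does not implement FUSE_INTERRUPT. */
		return (IRQ_IGNORE);
	case EAGAIN:
		/*
		 * Either the daemon received the interrupt before the
		 * original request, or after completing it; resending is
		 * harmless in both cases.
		 */
		return (IRQ_RESEND);
	default:
		return (IRQ_PROTO_ERR);
	}
}

int
main(void)
{
	printf("EAGAIN -> %d, ENOSYS -> %d, completed -> %d\n",
	    classify_interrupt_reply(true, EAGAIN),
	    classify_interrupt_reply(true, ENOSYS),
	    classify_interrupt_reply(false, 0));
	return (0);
}
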
Index: sys/fs/fuse/fuse_kernel.h
===================================================================
--- sys/fs/fuse/fuse_kernel.h
+++ sys/fs/fuse/fuse_kernel.h
@@ -1,6 +1,6 @@
/*--
* This file defines the kernel interface of FUSE
- * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ * Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
*
* This program can be distributed under the terms of the GNU GPL.
* See the file COPYING.
@@ -34,69 +34,134 @@
* $FreeBSD$
*/
-#ifndef linux
-#include <sys/types.h>
-#define __u64 uint64_t
-#define __u32 uint32_t
-#define __s32 int32_t
+/*
+ * This file defines the kernel interface of FUSE
+ *
+ * Protocol changelog:
+ *
+ * 7.9:
+ * - new fuse_getattr_in input argument of GETATTR
+ * - add lk_flags in fuse_lk_in
+ * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in
+ * - add blksize field to fuse_attr
+ * - add file flags field to fuse_read_in and fuse_write_in
+ *
+ * 7.10
+ * - add nonseekable open flag
+ *
+ * 7.11
+ * - add IOCTL message
+ * - add unsolicited notification support
+ *
+ * 7.12
+ * - add umask flag to input argument of open, mknod and mkdir
+ * - add notification messages for invalidation of inodes and
+ * directory entries
+ *
+ * 7.13
+ * - make max number of background requests and congestion threshold
+ * tunables
+ *
+ * 7.14
+ * - add splice support to fuse device
+ *
+ * 7.15
+ * - add store notify
+ * - add retrieve notify
+ *
+ * 7.16
+ * - add BATCH_FORGET request
+ * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct
+ * fuse_ioctl_iovec' instead of ambiguous 'struct iovec'
+ * - add FUSE_IOCTL_32BIT flag
+ *
+ * 7.17
+ * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK
+ *
+ * 7.18
+ * - add FUSE_IOCTL_DIR flag
+ * - add FUSE_NOTIFY_DELETE
+ *
+ * 7.19
+ * - add FUSE_FALLOCATE
+ *
+ * 7.20
+ * - add FUSE_AUTO_INVAL_DATA
+ * 7.21
+ * - add FUSE_READDIRPLUS
+ * - send the requested events in POLL request
+ *
+ * 7.22
+ * - add FUSE_ASYNC_DIO
+ *
+ * 7.23
+ * - add FUSE_WRITEBACK_CACHE
+ * - add time_gran to fuse_init_out
+ * - add reserved space to fuse_init_out
+ * - add FATTR_CTIME
+ * - add ctime and ctimensec to fuse_setattr_in
+ * - add FUSE_RENAME2 request
+ * - add FUSE_NO_OPEN_SUPPORT flag
+ */
+
+#ifndef _FUSE_FUSE_KERNEL_H
+#define _FUSE_FUSE_KERNEL_H
+
+#ifdef __linux__
+#include <linux/types.h>
#else
-#include <asm/types.h>
-#include <linux/major.h>
+#include <sys/types.h>
#endif
/** Version number of this interface */
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 8
+#define FUSE_KERNEL_MINOR_VERSION 23
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
-/** The major number of the fuse character device */
-#define FUSE_MAJOR MISC_MAJOR
-
-/** The minor number of the fuse character device */
-#define FUSE_MINOR 229
-
/* Make sure all structures are padded to 64bit boundary, so 32bit
userspace works under 64bit kernels */
struct fuse_attr {
- __u64 ino;
- __u64 size;
- __u64 blocks;
- __u64 atime;
- __u64 mtime;
- __u64 ctime;
- __u32 atimensec;
- __u32 mtimensec;
- __u32 ctimensec;
- __u32 mode;
- __u32 nlink;
- __u32 uid;
- __u32 gid;
- __u32 rdev;
+ uint64_t ino;
+ uint64_t size;
+ uint64_t blocks;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t ctime;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t ctimensec;
+ uint32_t mode;
+ uint32_t nlink;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t rdev;
+ uint32_t blksize;
+ uint32_t padding;
};
struct fuse_kstatfs {
- __u64 blocks;
- __u64 bfree;
- __u64 bavail;
- __u64 files;
- __u64 ffree;
- __u32 bsize;
- __u32 namelen;
- __u32 frsize;
- __u32 padding;
- __u32 spare[6];
+ uint64_t blocks;
+ uint64_t bfree;
+ uint64_t bavail;
+ uint64_t files;
+ uint64_t ffree;
+ uint32_t bsize;
+ uint32_t namelen;
+ uint32_t frsize;
+ uint32_t padding;
+ uint32_t spare[6];
};
struct fuse_file_lock {
- __u64 start;
- __u64 end;
- __u32 type;
- __u32 pid; /* tgid */
+ uint64_t start;
+ uint64_t end;
+ uint32_t type;
+ uint32_t pid; /* tgid */
};
/**
@@ -109,27 +174,128 @@
#define FATTR_ATIME (1 << 4)
#define FATTR_MTIME (1 << 5)
#define FATTR_FH (1 << 6)
+#define FATTR_ATIME_NOW (1 << 7)
+#define FATTR_MTIME_NOW (1 << 8)
+#define FATTR_LOCKOWNER (1 << 9)
+#define FATTR_CTIME (1 << 10)
/**
* Flags returned by the OPEN request
*
* FOPEN_DIRECT_IO: bypass page cache for this open file
* FOPEN_KEEP_CACHE: don't invalidate the data cache on open
+ * FOPEN_NONSEEKABLE: the file is not seekable
*/
#define FOPEN_DIRECT_IO (1 << 0)
#define FOPEN_KEEP_CACHE (1 << 1)
+#define FOPEN_NONSEEKABLE (1 << 2)
/**
* INIT request/reply flags
+ *
+ * FUSE_ASYNC_READ: asynchronous read requests
+ * FUSE_POSIX_LOCKS: remote locking for POSIX file locks
+ * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported)
+ * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem
+ * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".."
+ * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB
+ * FUSE_DONT_MASK: don't apply umask to file mode on create operations
+ * FUSE_SPLICE_WRITE: kernel supports splice write on the device
+ * FUSE_SPLICE_MOVE: kernel supports splice move on the device
+ * FUSE_SPLICE_READ: kernel supports splice read on the device
+ * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks
+ * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories
+ * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages
+ * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one)
+ * FUSE_READDIRPLUS_AUTO: adaptive readdirplus
+ * FUSE_ASYNC_DIO: asynchronous direct I/O submission
+ * FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes
+ * FUSE_NO_OPEN_SUPPORT: kernel supports zero-message opens
*/
#define FUSE_ASYNC_READ (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)
+#define FUSE_FILE_OPS (1 << 2)
+#define FUSE_ATOMIC_O_TRUNC (1 << 3)
+#define FUSE_EXPORT_SUPPORT (1 << 4)
+#define FUSE_BIG_WRITES (1 << 5)
+#define FUSE_DONT_MASK (1 << 6)
+#define FUSE_SPLICE_WRITE (1 << 7)
+#define FUSE_SPLICE_MOVE (1 << 8)
+#define FUSE_SPLICE_READ (1 << 9)
+#define FUSE_FLOCK_LOCKS (1 << 10)
+#define FUSE_HAS_IOCTL_DIR (1 << 11)
+#define FUSE_AUTO_INVAL_DATA (1 << 12)
+#define FUSE_DO_READDIRPLUS (1 << 13)
+#define FUSE_READDIRPLUS_AUTO (1 << 14)
+#define FUSE_ASYNC_DIO (1 << 15)
+#define FUSE_WRITEBACK_CACHE (1 << 16)
+#define FUSE_NO_OPEN_SUPPORT (1 << 17)
+#ifdef linux
/**
+ * CUSE INIT request/reply flags
+ *
+ * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl
+ */
+#define CUSE_UNRESTRICTED_IOCTL (1 << 0)
+#endif /* linux */
+
+/**
* Release flags
*/
#define FUSE_RELEASE_FLUSH (1 << 0)
+#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1)
+/**
+ * Getattr flags
+ */
+#define FUSE_GETATTR_FH (1 << 0)
+
+/**
+ * Lock flags
+ */
+#define FUSE_LK_FLOCK (1 << 0)
+
+/**
+ * WRITE flags
+ *
+ * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed
+ * FUSE_WRITE_LOCKOWNER: lock_owner field is valid
+ */
+#define FUSE_WRITE_CACHE (1 << 0)
+#define FUSE_WRITE_LOCKOWNER (1 << 1)
+
+/**
+ * Read flags
+ */
+#define FUSE_READ_LOCKOWNER (1 << 1)
+
+/**
+ * Ioctl flags
+ *
+ * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
+ * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed
+ * FUSE_IOCTL_RETRY: retry with new iovecs
+ * FUSE_IOCTL_32BIT: 32bit ioctl
+ * FUSE_IOCTL_DIR: is a directory
+ *
+ * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs
+ */
+#define FUSE_IOCTL_COMPAT (1 << 0)
+#define FUSE_IOCTL_UNRESTRICTED (1 << 1)
+#define FUSE_IOCTL_RETRY (1 << 2)
+#define FUSE_IOCTL_32BIT (1 << 3)
+#define FUSE_IOCTL_DIR (1 << 4)
+
+#define FUSE_IOCTL_MAX_IOV 256
+
+/**
+ * Poll flags
+ *
+ * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify
+ */
+#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0)
+
enum fuse_opcode {
FUSE_LOOKUP = 1,
FUSE_FORGET = 2, /* no reply */
@@ -167,107 +333,179 @@
FUSE_INTERRUPT = 36,
FUSE_BMAP = 37,
FUSE_DESTROY = 38,
+ FUSE_IOCTL = 39,
+ FUSE_POLL = 40,
+ FUSE_NOTIFY_REPLY = 41,
+ FUSE_BATCH_FORGET = 42,
+ FUSE_FALLOCATE = 43,
+ FUSE_READDIRPLUS = 44,
+ FUSE_RENAME2 = 45,
+
+#ifdef linux
+ /* CUSE specific operations */
+ CUSE_INIT = 4096,
+#endif /* linux */
};
+enum fuse_notify_code {
+ FUSE_NOTIFY_POLL = 1,
+ FUSE_NOTIFY_INVAL_INODE = 2,
+ FUSE_NOTIFY_INVAL_ENTRY = 3,
+ FUSE_NOTIFY_STORE = 4,
+ FUSE_NOTIFY_RETRIEVE = 5,
+ FUSE_NOTIFY_DELETE = 6,
+ FUSE_NOTIFY_CODE_MAX,
+};
+
/* The read buffer is required to be at least 8k, but may be much larger */
#define FUSE_MIN_READ_BUFFER 8192
+#define FUSE_COMPAT_ENTRY_OUT_SIZE 120
+
struct fuse_entry_out {
- __u64 nodeid; /* Inode ID */
- __u64 generation; /* Inode generation: nodeid:gen must
- be unique for the fs's lifetime */
- __u64 entry_valid; /* Cache timeout for the name */
- __u64 attr_valid; /* Cache timeout for the attributes */
- __u32 entry_valid_nsec;
- __u32 attr_valid_nsec;
+ uint64_t nodeid; /* Inode ID */
+ uint64_t generation; /* Inode generation: nodeid:gen must
+ be unique for the fs's lifetime */
+ uint64_t entry_valid; /* Cache timeout for the name */
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t entry_valid_nsec;
+ uint32_t attr_valid_nsec;
struct fuse_attr attr;
};
struct fuse_forget_in {
- __u64 nlookup;
+ uint64_t nlookup;
};
+struct fuse_forget_one {
+ uint64_t nodeid;
+ uint64_t nlookup;
+};
+
+struct fuse_batch_forget_in {
+ uint32_t count;
+ uint32_t dummy;
+};
+
+struct fuse_getattr_in {
+ uint32_t getattr_flags;
+ uint32_t dummy;
+ uint64_t fh;
+};
+
+#define FUSE_COMPAT_ATTR_OUT_SIZE 96
+
struct fuse_attr_out {
- __u64 attr_valid; /* Cache timeout for the attributes */
- __u32 attr_valid_nsec;
- __u32 dummy;
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t attr_valid_nsec;
+ uint32_t dummy;
struct fuse_attr attr;
};
+#define FUSE_COMPAT_MKNOD_IN_SIZE 8
+
+struct fuse_mknod_in {
+ uint32_t mode;
+ uint32_t rdev;
+ uint32_t umask;
+ uint32_t padding;
+};
+
struct fuse_mkdir_in {
- __u32 mode;
- __u32 padding;
+ uint32_t mode;
+ uint32_t umask;
};
struct fuse_rename_in {
- __u64 newdir;
+ uint64_t newdir;
};
+struct fuse_rename2_in {
+ uint64_t newdir;
+ uint32_t flags;
+ uint32_t padding;
+};
+
struct fuse_link_in {
- __u64 oldnodeid;
+ uint64_t oldnodeid;
};
struct fuse_setattr_in {
- __u32 valid;
- __u32 padding;
- __u64 fh;
- __u64 size;
- __u64 unused1;
- __u64 atime;
- __u64 mtime;
- __u64 unused2;
- __u32 atimensec;
- __u32 mtimensec;
- __u32 unused3;
- __u32 mode;
- __u32 unused4;
- __u32 uid;
- __u32 gid;
- __u32 unused5;
+ uint32_t valid;
+ uint32_t padding;
+ uint64_t fh;
+ uint64_t size;
+ uint64_t lock_owner;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t ctime;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t ctimensec;
+ uint32_t mode;
+ uint32_t unused4;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t unused5;
};
struct fuse_open_in {
- __u32 flags;
- __u32 mode;
+ uint32_t flags;
+ uint32_t unused;
};
+struct fuse_create_in {
+ uint32_t flags;
+ uint32_t mode;
+ uint32_t umask;
+ uint32_t padding;
+};
+
struct fuse_open_out {
- __u64 fh;
- __u32 open_flags;
- __u32 padding;
+ uint64_t fh;
+ uint32_t open_flags;
+ uint32_t padding;
};
struct fuse_release_in {
- __u64 fh;
- __u32 flags;
- __u32 release_flags;
- __u64 lock_owner;
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t release_flags;
+ uint64_t lock_owner;
};
struct fuse_flush_in {
- __u64 fh;
- __u32 unused;
- __u32 padding;
- __u64 lock_owner;
+ uint64_t fh;
+ uint32_t unused;
+ uint32_t padding;
+ uint64_t lock_owner;
};
struct fuse_read_in {
- __u64 fh;
- __u64 offset;
- __u32 size;
- __u32 padding;
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t read_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
};
+#define FUSE_COMPAT_WRITE_IN_SIZE 24
+
struct fuse_write_in {
- __u64 fh;
- __u64 offset;
- __u32 size;
- __u32 write_flags;
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t write_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
};
struct fuse_write_out {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
#define FUSE_COMPAT_STATFS_SIZE 48
@@ -277,40 +515,42 @@
};
struct fuse_fsync_in {
- __u64 fh;
- __u32 fsync_flags;
- __u32 padding;
+ uint64_t fh;
+ uint32_t fsync_flags;
+ uint32_t padding;
};
+struct fuse_setxattr_in {
+ uint32_t size;
+ uint32_t flags;
+};
+
struct fuse_listxattr_in {
- __u32 size;
- __u32 flags;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_listxattr_out {
- __u32 size;
- __u32 flags;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_getxattr_in {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
struct fuse_getxattr_out {
- __u32 size;
- __u32 padding;
+ uint32_t size;
+ uint32_t padding;
};
-struct fuse_setxattr_in {
- __u32 size;
- __u32 flags;
-};
-
struct fuse_lk_in {
- __u64 fh;
- __u64 owner;
+ uint64_t fh;
+ uint64_t owner;
struct fuse_file_lock lk;
+ uint32_t lk_flags;
+ uint32_t padding;
};
struct fuse_lk_out {
@@ -318,66 +558,197 @@
};
struct fuse_access_in {
- __u32 mask;
- __u32 padding;
+ uint32_t mask;
+ uint32_t padding;
};
struct fuse_init_in {
- __u32 major;
- __u32 minor;
- __u32 max_readahead;
- __u32 flags;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
};
+#define FUSE_COMPAT_INIT_OUT_SIZE 8
+#define FUSE_COMPAT_22_INIT_OUT_SIZE 24
+
struct fuse_init_out {
- __u32 major;
- __u32 minor;
- __u32 max_readahead;
- __u32 flags;
- __u32 unused;
- __u32 max_write;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
+ uint16_t max_background;
+ uint16_t congestion_threshold;
+ uint32_t max_write;
+ uint32_t time_gran;
+ uint32_t unused[9];
};
+#ifdef linux
+#define CUSE_INIT_INFO_MAX 4096
+
+struct cuse_init_in {
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
+};
+
+struct cuse_init_out {
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
+ uint32_t max_read;
+ uint32_t max_write;
+ uint32_t dev_major; /* chardev major */
+ uint32_t dev_minor; /* chardev minor */
+ uint32_t spare[10];
+};
+#endif /* linux */
+
struct fuse_interrupt_in {
- __u64 unique;
+ uint64_t unique;
};
struct fuse_bmap_in {
- __u64 block;
- __u32 blocksize;
- __u32 padding;
+ uint64_t block;
+ uint32_t blocksize;
+ uint32_t padding;
};
struct fuse_bmap_out {
- __u64 block;
+ uint64_t block;
};
+struct fuse_ioctl_in {
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t cmd;
+ uint64_t arg;
+ uint32_t in_size;
+ uint32_t out_size;
+};
+
+struct fuse_ioctl_iovec {
+ uint64_t base;
+ uint64_t len;
+};
+
+struct fuse_ioctl_out {
+ int32_t result;
+ uint32_t flags;
+ uint32_t in_iovs;
+ uint32_t out_iovs;
+};
+
+struct fuse_poll_in {
+ uint64_t fh;
+ uint64_t kh;
+ uint32_t flags;
+ uint32_t events;
+};
+
+struct fuse_poll_out {
+ uint32_t revents;
+ uint32_t padding;
+};
+
+struct fuse_notify_poll_wakeup_out {
+ uint64_t kh;
+};
+
+struct fuse_fallocate_in {
+ uint64_t fh;
+ uint64_t offset;
+ uint64_t length;
+ uint32_t mode;
+ uint32_t padding;
+};
+
struct fuse_in_header {
- __u32 len;
- __u32 opcode;
- __u64 unique;
- __u64 nodeid;
- __u32 uid;
- __u32 gid;
- __u32 pid;
- __u32 padding;
+ uint32_t len;
+ uint32_t opcode;
+ uint64_t unique;
+ uint64_t nodeid;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t pid;
+ uint32_t padding;
};
struct fuse_out_header {
- __u32 len;
- __s32 error;
- __u64 unique;
+ uint32_t len;
+ int32_t error;
+ uint64_t unique;
};
struct fuse_dirent {
- __u64 ino;
- __u64 off;
- __u32 namelen;
- __u32 type;
- char name[0];
+ uint64_t ino;
+ uint64_t off;
+ uint32_t namelen;
+ uint32_t type;
+ char name[];
};
#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
-#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
+#define FUSE_DIRENT_ALIGN(x) \
+ (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
#define FUSE_DIRENT_SIZE(d) \
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
+
+struct fuse_direntplus {
+ struct fuse_entry_out entry_out;
+ struct fuse_dirent dirent;
+};
+
+#define FUSE_NAME_OFFSET_DIRENTPLUS \
+ offsetof(struct fuse_direntplus, dirent.name)
+#define FUSE_DIRENTPLUS_SIZE(d) \
+ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen)
+
+struct fuse_notify_inval_inode_out {
+ uint64_t ino;
+ int64_t off;
+ int64_t len;
+};
+
+struct fuse_notify_inval_entry_out {
+ uint64_t parent;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_delete_out {
+ uint64_t parent;
+ uint64_t child;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_store_out {
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+struct fuse_notify_retrieve_out {
+ uint64_t notify_unique;
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+/* Matches the size of fuse_write_in */
+struct fuse_notify_retrieve_in {
+ uint64_t dummy1;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t dummy2;
+ uint64_t dummy3;
+ uint64_t dummy4;
+};
+
+#endif /* _FUSE_FUSE_KERNEL_H */
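
The dirent macros near the end of the header pad every directory entry record to a 64-bit boundary, which is why FUSE_DIRENT_SIZE depends only on namelen. A standalone check of that arithmetic; the struct and macros are copied from the header above, only main() is new:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct fuse_dirent {
	uint64_t ino;
	uint64_t off;
	uint32_t namelen;
	uint32_t type;
	char name[];
};

#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
#define FUSE_DIRENT_ALIGN(x) \
	(((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))

int
main(void)
{
	uint32_t namelen;

	/* The fixed header is 24 bytes; names are padded to 8-byte multiples. */
	for (namelen = 1; namelen <= 16; namelen += 5)
		printf("namelen %2u -> record size %zu\n", namelen,
		    (size_t)FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + namelen));
	return (0);
}

The padding keeps the 64-bit ino and off fields of the following record naturally aligned when entries are packed back to back in a READDIR reply.
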
Index: sys/fs/fuse/fuse_main.c
===================================================================
--- sys/fs/fuse/fuse_main.c
+++ sys/fs/fuse/fuse_main.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -77,6 +82,10 @@
#include <sys/sysctl.h>
#include "fuse.h"
+#include "fuse_file.h"
+#include "fuse_ipc.h"
+#include "fuse_internal.h"
+#include "fuse_node.h"
static void fuse_bringdown(eventhandler_tag eh_tag);
static int fuse_loader(struct module *m, int what, void *arg);
@@ -85,7 +94,7 @@
extern struct vfsops fuse_vfsops;
extern struct cdevsw fuse_cdevsw;
-extern struct vop_vector fuse_vnops;
+extern struct vop_vector fuse_fifonops;
extern uma_zone_t fuse_pbuf_zone;
static struct vfsconf fuse_vfsconf = {
@@ -96,11 +105,13 @@
.vfc_flags = VFCF_JAIL | VFCF_SYNTHETIC
};
+SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables");
+SYSCTL_NODE(_vfs_fusefs, OID_AUTO, stats, CTLFLAG_RW, 0, "FUSE statistics");
SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_major, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, FUSE_KERNEL_VERSION, "FUSE kernel abi major version");
SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_minor, CTLFLAG_RD,
SYSCTL_NULL_INT_PTR, FUSE_KERNEL_MINOR_VERSION, "FUSE kernel abi minor version");
-SDT_PROVIDER_DEFINE(fuse);
+SDT_PROVIDER_DEFINE(fusefs);
/******************************
*
@@ -111,7 +122,9 @@
static void
fuse_bringdown(eventhandler_tag eh_tag)
{
-
+ fuse_node_destroy();
+ fuse_internal_destroy();
+ fuse_file_destroy();
fuse_ipc_destroy();
fuse_device_destroy();
mtx_destroy(&fuse_mtx);
@@ -132,16 +145,14 @@
return (err);
}
fuse_ipc_init();
+ fuse_file_init();
+ fuse_internal_init();
+ fuse_node_init();
fuse_pbuf_zone = pbuf_zsecond_create("fusepbuf", nswbuf / 2);
/* vfs_modevent ignores its first arg */
if ((err = vfs_modevent(NULL, what, &fuse_vfsconf)))
fuse_bringdown(eh_tag);
- else
- printf("fuse-freebsd: version %s, FUSE ABI %d.%d\n",
- FUSE_FREEBSD_VERSION,
- FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
-
break;
case MOD_UNLOAD:
if ((err = vfs_modevent(NULL, what, &fuse_vfsconf)))
Index: sys/fs/fuse/fuse_node.h
===================================================================
--- sys/fs/fuse/fuse_node.h
+++ sys/fs/fuse/fuse_node.h
@@ -32,6 +32,11 @@
*
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -60,60 +65,121 @@
#ifndef _FUSE_NODE_H_
#define _FUSE_NODE_H_
+#include <sys/fnv_hash.h>
#include <sys/types.h>
#include <sys/mutex.h>
#include "fuse_file.h"
-#define FN_REVOKED 0x00000020
-#define FN_FLUSHINPROG 0x00000040
-#define FN_FLUSHWANT 0x00000080
-#define FN_SIZECHANGE 0x00000100
-#define FN_DIRECTIO 0x00000200
+#define FN_REVOKED 0x00000020
+#define FN_FLUSHINPROG 0x00000040
+#define FN_FLUSHWANT 0x00000080
+/*
+ * Indicates that the file's size is dirty; the kernel has changed it but not
+ * yet sent the change to the daemon. When this bit is set, the
+ * cached_attrs.va_size field does not time out.
+ */
+#define FN_SIZECHANGE 0x00000100
+#define FN_DIRECTIO 0x00000200
+/* Indicates that parent_nid is valid */
+#define FN_PARENT_NID 0x00000400
+/*
+ * Indicates that the file's cached timestamps are dirty. They will be flushed
+ * during the next SETATTR or WRITE. Until then, the cached fields will not
+ * time out.
+ */
+#define FN_MTIMECHANGE 0x00000800
+#define FN_CTIMECHANGE 0x00001000
+
struct fuse_vnode_data {
/** self **/
uint64_t nid;
+ uint64_t generation;
/** parent **/
- /* XXXIP very likely to be stale, it's not updated in rename() */
uint64_t parent_nid;
/** I/O **/
- struct fuse_filehandle fufh[FUFH_MAXTYPE];
+ /* List of file handles for all of the vnode's open file descriptors */
+ LIST_HEAD(, fuse_filehandle) handles;
/** flags **/
uint32_t flag;
/** meta **/
- bool valid_attr_cache;
+ /* The monotonic time after which the attr cache is invalid */
+ struct bintime attr_cache_timeout;
+ /*
+ * Monotonic time after which the entry is invalid. Used for lookups
+ * by nodeid instead of pathname.
+ */
+ struct bintime entry_cache_timeout;
struct vattr cached_attrs;
- off_t filesize;
uint64_t nlookup;
enum vtype vtype;
};
+/*
+ * This overlays the fid structure (see mount.h). Mostly the same as the types
+ * used by UFS and ext2.
+ */
+struct fuse_fid {
+ uint16_t len; /* Length of structure. */
+ uint16_t pad; /* Force 32-bit alignment. */
+ uint32_t gen; /* Generation number. */
+ uint64_t nid; /* FUSE node id. */
+};
+
#define VTOFUD(vp) \
((struct fuse_vnode_data *)((vp)->v_data))
#define VTOI(vp) (VTOFUD(vp)->nid)
-#define VTOVA(vp) \
- (VTOFUD(vp)->valid_attr_cache ? \
- &(VTOFUD(vp)->cached_attrs) : NULL)
+static inline struct vattr*
+VTOVA(struct vnode *vp)
+{
+ struct bintime now;
+
+ getbinuptime(&now);
+ if (bintime_cmp(&(VTOFUD(vp)->attr_cache_timeout), &now, >))
+ return &(VTOFUD(vp)->cached_attrs);
+ else
+ return NULL;
+}
+
+static inline void
+fuse_vnode_clear_attr_cache(struct vnode *vp)
+{
+ bintime_clear(&VTOFUD(vp)->attr_cache_timeout);
+}
+
+static uint32_t inline
+fuse_vnode_hash(uint64_t id)
+{
+ return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT));
+}
+
#define VTOILLU(vp) ((uint64_t)(VTOFUD(vp) ? VTOI(vp) : 0))
#define FUSE_NULL_ID 0
+extern struct vop_vector fuse_fifoops;
extern struct vop_vector fuse_vnops;
+int fuse_vnode_cmp(struct vnode *vp, void *nidp);
+
static inline void
fuse_vnode_setparent(struct vnode *vp, struct vnode *dvp)
{
if (dvp != NULL && vp->v_type == VDIR) {
MPASS(dvp->v_type == VDIR);
VTOFUD(vp)->parent_nid = VTOI(dvp);
+ VTOFUD(vp)->flag |= FN_PARENT_NID;
}
}
+int fuse_vnode_size(struct vnode *vp, off_t *filesize, struct ucred *cred,
+ struct thread *td);
+
void fuse_vnode_destroy(struct vnode *vp);
int fuse_vnode_get(struct mount *mp, struct fuse_entry_out *feo,
@@ -123,10 +189,14 @@
void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags,
struct thread *td);
-void fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred);
+int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred, pid_t pid);
-int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred);
-
int fuse_vnode_setsize(struct vnode *vp, off_t newsize);
+void fuse_vnode_undirty_cached_timestamps(struct vnode *vp);
+
+void fuse_vnode_update(struct vnode *vp, int flags);
+
+void fuse_node_init(void);
+void fuse_node_destroy(void);
#endif /* _FUSE_NODE_H_ */
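
VTOVA() above hands back the cached attributes only while a monotonic deadline (attr_cache_timeout) lies in the future, and fuse_vnode_clear_attr_cache() invalidates the cache by zeroing that deadline. A userspace analog of the same pattern, substituting clock_gettime(CLOCK_MONOTONIC) for the kernel's getbinuptime()/bintime comparison; all names below are hypothetical:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Hypothetical analog of the per-vnode attribute cache deadline. */
struct attr_cache {
	struct timespec timeout;	/* monotonic expiry; zero == invalid */
	long cached_size;		/* stand-in for struct vattr */
};

static bool
ts_before(const struct timespec *a, const struct timespec *b)
{
	return (a->tv_sec < b->tv_sec ||
	    (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec));
}

/* Like VTOVA: return the cached attrs only if the deadline is in the future. */
static long *
cache_lookup(struct attr_cache *c)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return (ts_before(&now, &c->timeout) ? &c->cached_size : NULL);
}

/* valid/valid_nsec play the role of attr_valid/attr_valid_nsec in a reply. */
static void
cache_store(struct attr_cache *c, long size, time_t valid, long valid_nsec)
{
	clock_gettime(CLOCK_MONOTONIC, &c->timeout);
	c->timeout.tv_sec += valid;
	c->timeout.tv_nsec += valid_nsec;
	if (c->timeout.tv_nsec >= 1000000000L) {
		c->timeout.tv_sec++;
		c->timeout.tv_nsec -= 1000000000L;
	}
	c->cached_size = size;
}

int
main(void)
{
	struct attr_cache c = { .timeout = { 0, 0 }, .cached_size = 0 };

	cache_store(&c, 4096, 1, 0);		/* valid for one second */
	printf("hit: %s\n", cache_lookup(&c) ? "yes" : "no");
	c.timeout = (struct timespec){ 0, 0 };	/* like fuse_vnode_clear_attr_cache */
	printf("hit: %s\n", cache_lookup(&c) ? "yes" : "no");
	return (0);
}
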
Index: sys/fs/fuse/fuse_node.c
===================================================================
--- sys/fs/fuse/fuse_node.c
+++ sys/fs/fuse/fuse_node.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -59,8 +64,9 @@
__FBSDID("$FreeBSD$");
#include <sys/types.h>
-#include <sys/module.h>
#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/module.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/kernel.h>
@@ -77,8 +83,8 @@
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/fcntl.h>
-#include <sys/fnv_hash.h>
#include <sys/priv.h>
+#include <sys/buf.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -89,65 +95,40 @@
#include "fuse_io.h"
#include "fuse_ipc.h"
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , node, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , node, trace, "int", "char*");
MALLOC_DEFINE(M_FUSEVN, "fuse_vnode", "fuse vnode private data");
static int sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS);
-static int fuse_node_count = 0;
+static counter_u64_t fuse_node_count;
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, node_count, CTLFLAG_RD,
- &fuse_node_count, 0, "Count of FUSE vnodes");
+SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, node_count, CTLFLAG_RD,
+ &fuse_node_count, "Count of FUSE vnodes");
int fuse_data_cache_mode = FUSE_CACHE_WT;
+/*
+ * DEPRECATED
+ * This sysctl is no longer needed as of fuse protocol 7.23. Individual
+ * servers can select the cache behavior they need for each mountpoint:
+ * - writethrough: the default
+ * - writeback: set FUSE_WRITEBACK_CACHE in fuse_init_out.flags
+ * - uncached: set FOPEN_DIRECT_IO for every file
+ * The sysctl is retained primarily for use by jails supporting older FUSE
+ * protocols. It may be removed entirely once FreeBSD 11.3 and 12.0 are EOL.
+ */
SYSCTL_PROC(_vfs_fusefs, OID_AUTO, data_cache_mode, CTLTYPE_INT|CTLFLAG_RW,
&fuse_data_cache_mode, 0, sysctl_fuse_cache_mode, "I",
"Zero: disable caching of FUSE file data; One: write-through caching "
"(default); Two: write-back caching (generally unsafe)");
-int fuse_data_cache_invalidate = 0;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, data_cache_invalidate, CTLFLAG_RW,
- &fuse_data_cache_invalidate, 0,
- "If non-zero, discard cached clean file data when there are no active file"
- " users");
-
-int fuse_mmap_enable = 1;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, mmap_enable, CTLFLAG_RW,
- &fuse_mmap_enable, 0,
- "If non-zero, and data_cache_mode is also non-zero, enable mmap(2) of "
- "FUSE files");
-
-int fuse_refresh_size = 0;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, refresh_size, CTLFLAG_RW,
- &fuse_refresh_size, 0,
- "If non-zero, and no dirty file extension data is buffered, fetch file "
- "size before write operations");
-
-int fuse_sync_resize = 1;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, sync_resize, CTLFLAG_RW,
- &fuse_sync_resize, 0,
- "If a cached write extended a file, inform FUSE filesystem of the changed"
- "size immediately subsequent to the issued writes");
-
-int fuse_fix_broken_io = 0;
-
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, fix_broken_io, CTLFLAG_RW,
- &fuse_fix_broken_io, 0,
- "If non-zero, print a diagnostic warning if a userspace filesystem returns"
- " EIO on reads of recently extended portions of files");
-
static int
sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS)
{
@@ -174,9 +155,8 @@
fuse_vnode_init(struct vnode *vp, struct fuse_vnode_data *fvdat,
uint64_t nodeid, enum vtype vtyp)
{
- int i;
-
fvdat->nid = nodeid;
+ LIST_INIT(&fvdat->handles);
vattr_null(&fvdat->cached_attrs);
if (nodeid == FUSE_ROOT_ID) {
vp->v_vflag |= VV_ROOT;
@@ -184,10 +164,7 @@
vp->v_type = vtyp;
vp->v_data = fvdat;
- for (i = 0; i < FUFH_MAXTYPE; i++)
- fvdat->fufh[i].fh_type = FUFH_INVALID;
-
- atomic_add_acq_int(&fuse_node_count, 1);
+ counter_u64_add(fuse_node_count, 1);
}
void
@@ -196,23 +173,21 @@
struct fuse_vnode_data *fvdat = vp->v_data;
vp->v_data = NULL;
+ KASSERT(LIST_EMPTY(&fvdat->handles),
+ ("Destroying fuse vnode with open files!"));
free(fvdat, M_FUSEVN);
- atomic_subtract_acq_int(&fuse_node_count, 1);
+ counter_u64_add(fuse_node_count, -1);
}
-static int
+int
fuse_vnode_cmp(struct vnode *vp, void *nidp)
{
return (VTOI(vp) != *((uint64_t *)nidp));
}
-static uint32_t inline
-fuse_vnode_hash(uint64_t id)
-{
- return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT));
-}
-
+SDT_PROBE_DEFINE3(fusefs, , node, stale_vnode, "struct vnode*", "enum vtype",
+ "uint64_t");
static int
fuse_vnode_alloc(struct mount *mp,
struct thread *td,
@@ -220,10 +195,12 @@
enum vtype vtyp,
struct vnode **vpp)
{
+ struct fuse_data *data;
struct fuse_vnode_data *fvdat;
struct vnode *vp2;
int err = 0;
+ data = fuse_get_mpdata(mp);
if (vtyp == VNON) {
return EINVAL;
}
@@ -234,12 +211,34 @@
return (err);
if (*vpp) {
- MPASS((*vpp)->v_type == vtyp && (*vpp)->v_data != NULL);
- SDT_PROBE2(fuse, , node, trace, 1, "vnode taken from hash");
+ if ((*vpp)->v_type != vtyp) {
+ /*
+ * STALE vnode! This probably indicates a buggy
+ * server, but it could also be the result of a race
+ * between FUSE_LOOKUP and another client's
+ * FUSE_UNLINK/FUSE_CREATE
+ */
+ SDT_PROBE3(fusefs, , node, stale_vnode, *vpp, vtyp,
+ nodeid);
+ fuse_internal_vnode_disappear(*vpp);
+ lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL);
+ *vpp = NULL;
+ return (EAGAIN);
+ }
+ MPASS((*vpp)->v_data != NULL);
+ MPASS(VTOFUD(*vpp)->nid == nodeid);
+ SDT_PROBE2(fusefs, , node, trace, 1, "vnode taken from hash");
return (0);
}
fvdat = malloc(sizeof(*fvdat), M_FUSEVN, M_WAITOK | M_ZERO);
- err = getnewvnode("fuse", mp, &fuse_vnops, vpp);
+ switch (vtyp) {
+ case VFIFO:
+ err = getnewvnode("fuse", mp, &fuse_fifoops, vpp);
+ break;
+ default:
+ err = getnewvnode("fuse", mp, &fuse_vnops, vpp);
+ break;
+ }
if (err) {
free(fvdat, M_FUSEVN);
return (err);
@@ -249,14 +248,23 @@
err = insmntque(*vpp, mp);
ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc");
if (err) {
+ lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL);
free(fvdat, M_FUSEVN);
*vpp = NULL;
return (err);
}
+ /* Disallow async reads for fifos because UFS does. I don't know why */
+ if (data->dataflags & FSESS_ASYNC_READ && vtyp != VFIFO)
+ VN_LOCK_ASHARE(*vpp);
+
err = vfs_hash_insert(*vpp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE,
td, &vp2, fuse_vnode_cmp, &nodeid);
- if (err)
+ if (err) {
+ lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL);
+ free(fvdat, M_FUSEVN);
+ *vpp = NULL;
return (err);
+ }
if (vp2 != NULL) {
*vpp = vp2;
return (0);
@@ -277,6 +285,11 @@
enum vtype vtyp)
{
struct thread *td = (cnp != NULL ? cnp->cn_thread : curthread);
+ /*
+ * feo should only be NULL for the root directory, which (when libfuse
+ * is used) always has generation 0
+ */
+ uint64_t generation = feo ? feo->generation : 0;
int err = 0;
err = fuse_vnode_alloc(mp, td, nodeid, vtyp, vpp);
@@ -284,22 +297,28 @@
return err;
}
if (dvp != NULL) {
- MPASS((cnp->cn_flags & ISDOTDOT) == 0);
- MPASS(!(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'));
+ MPASS(cnp && (cnp->cn_flags & ISDOTDOT) == 0);
+ MPASS(cnp &&
+ !(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'));
fuse_vnode_setparent(*vpp, dvp);
}
if (dvp != NULL && cnp != NULL && (cnp->cn_flags & MAKEENTRY) != 0 &&
feo != NULL &&
(feo->entry_valid != 0 || feo->entry_valid_nsec != 0)) {
+ struct timespec timeout;
+
ASSERT_VOP_LOCKED(*vpp, "fuse_vnode_get");
ASSERT_VOP_LOCKED(dvp, "fuse_vnode_get");
- cache_enter(dvp, *vpp, cnp);
+
+ fuse_validity_2_timespec(feo, &timeout);
+ cache_enter_time(dvp, *vpp, cnp, &timeout, NULL);
}
+ VTOFUD(*vpp)->generation = generation;
/*
* In userland, libfuse uses cached lookups for dot and dotdot entries,
* thus it does not really bump the nlookup counter for forget.
- * Follow the same semantic and avoid tu bump it in order to keep
+ * Follow the same semantic and avoid the bump in order to keep
* nlookup counters consistent.
*/
if (cnp == NULL || ((cnp->cn_flags & ISDOTDOT) == 0 &&
@@ -309,44 +328,19 @@
return 0;
}
+/*
+ * Called for every fusefs vnode open to initialize the vnode (not
+ * fuse_filehandle) for use
+ */
void
fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td)
{
- /*
- * Funcation is called for every vnode open.
- * Merge fuse_open_flags it may be 0
- */
- /*
- * Ideally speaking, direct io should be enabled on
- * fd's but do not see of any way of providing that
- * this implementation.
- *
- * Also cannot think of a reason why would two
- * different fd's on same vnode would like
- * have DIRECT_IO turned on and off. But linux
- * based implementation works on an fd not an
- * inode and provides such a feature.
- *
- * XXXIP: Handle fd based DIRECT_IO
- */
- if (fuse_open_flags & FOPEN_DIRECT_IO) {
- ASSERT_VOP_ELOCKED(vp, __func__);
- VTOFUD(vp)->flag |= FN_DIRECTIO;
- fuse_io_invalbuf(vp, td);
- } else {
- if ((fuse_open_flags & FOPEN_KEEP_CACHE) == 0)
- fuse_io_invalbuf(vp, td);
- VTOFUD(vp)->flag &= ~FN_DIRECTIO;
- }
-
- if (vnode_vtype(vp) == VREG) {
- /* XXXIP prevent getattr, by using cached node size */
+ if (vnode_vtype(vp) == VREG)
vnode_create_vobject(vp, 0, td);
- }
}
int
-fuse_vnode_savesize(struct vnode *vp, struct ucred *cred)
+fuse_vnode_savesize(struct vnode *vp, struct ucred *cred, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct thread *td = curthread;
@@ -375,10 +369,11 @@
fsai->valid = 0;
/* Truncate to a new value. */
- fsai->size = fvdat->filesize;
+ MPASS((fvdat->flag & FN_SIZECHANGE) != 0);
+ fsai->size = fvdat->cached_attrs.va_size;
fsai->valid |= FATTR_SIZE;
- fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh);
+ fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid);
if (fufh) {
fsai->fh = fufh->fh_id;
fsai->valid |= FATTR_FH;
@@ -391,38 +386,116 @@
return err;
}
-void
-fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred)
-{
-
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct vattr va;
-
- if ((fvdat->flag & FN_SIZECHANGE) != 0 ||
- fuse_data_cache_mode == FUSE_CACHE_UC ||
- (fuse_refresh_size == 0 && fvdat->filesize != 0))
- return;
-
- VOP_GETATTR(vp, &va, cred);
- SDT_PROBE2(fuse, , node, trace, 1, "refreshed file size");
-}
-
+/*
+ * Adjust the vnode's size to a new value, such as that provided by
+ * FUSE_GETATTR.
+ */
int
fuse_vnode_setsize(struct vnode *vp, off_t newsize)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct vattr *attrs;
off_t oldsize;
+ size_t iosize;
+ struct buf *bp = NULL;
int err = 0;
ASSERT_VOP_ELOCKED(vp, "fuse_vnode_setsize");
- oldsize = fvdat->filesize;
- fvdat->filesize = newsize;
- fvdat->flag |= FN_SIZECHANGE;
+ iosize = fuse_iosize(vp);
+ oldsize = fvdat->cached_attrs.va_size;
+ fvdat->cached_attrs.va_size = newsize;
+ if ((attrs = VTOVA(vp)) != NULL)
+ attrs->va_size = newsize;
if (newsize < oldsize) {
+ daddr_t lbn;
+
err = vtruncbuf(vp, newsize, fuse_iosize(vp));
+ if (err)
+ goto out;
+ if (newsize % iosize == 0)
+ goto out;
+ /*
+ * Zero the contents of the last partial block.
+ * Sure seems like vtruncbuf should do this for us.
+ */
+
+ lbn = newsize / iosize;
+ bp = getblk(vp, lbn, iosize, PCATCH, 0, 0);
+ if (!bp) {
+ err = EINTR;
+ goto out;
+ }
+ if (!(bp->b_flags & B_CACHE))
+ goto out; /* Nothing to do */
+ MPASS(bp->b_flags & B_VMIO);
+ vfs_bio_clrbuf(bp);
+ bp->b_dirtyend = MIN(bp->b_dirtyend, newsize - lbn * iosize);
}
+out:
+ if (bp)
+ brelse(bp);
vnode_pager_setsize(vp, newsize);
return err;
+}
+
+/* Get the current, possibly dirty, size of the file */
+int
+fuse_vnode_size(struct vnode *vp, off_t *filesize, struct ucred *cred,
+ struct thread *td)
+{
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ int error = 0;
+
+ if (!(fvdat->flag & FN_SIZECHANGE) &&
+ (VTOVA(vp) == NULL || fvdat->cached_attrs.va_size == VNOVAL))
+ error = fuse_internal_do_getattr(vp, NULL, cred, td);
+
+ if (!error)
+ *filesize = fvdat->cached_attrs.va_size;
+
+ return error;
+}
+
+void
+fuse_vnode_undirty_cached_timestamps(struct vnode *vp)
+{
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+
+ fvdat->flag &= ~(FN_MTIMECHANGE | FN_CTIMECHANGE);
+}
+
+/* Update a fuse file's cached timestamps */
+void
+fuse_vnode_update(struct vnode *vp, int flags)
+{
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
+ struct timespec ts;
+
+ vfs_timestamp(&ts);
+
+ if (data->time_gran > 1)
+ ts.tv_nsec = rounddown(ts.tv_nsec, data->time_gran);
+
+ if (flags & FN_MTIMECHANGE)
+ fvdat->cached_attrs.va_mtime = ts;
+ if (flags & FN_CTIMECHANGE)
+ fvdat->cached_attrs.va_ctime = ts;
+
+ fvdat->flag |= flags;
+}
+
+void
+fuse_node_init(void)
+{
+ fuse_node_count = counter_u64_alloc(M_WAITOK);
+ counter_u64_zero(fuse_node_count);
+}
+
+void
+fuse_node_destroy(void)
+{
+ counter_u64_free(fuse_node_count);
}
Index: sys/fs/fuse/fuse_param.h
===================================================================
--- sys/fs/fuse/fuse_param.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Copyright (c) 2007-2009 Google Inc. and Amit Singh
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Google Inc. nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _FUSE_PARAM_H_
-#define _FUSE_PARAM_H_
-
-/*
- * This is the prefix ("fuse" by default) of the name of a FUSE device node
- * in devfs. The suffix is the device number. "/dev/fuse0" is the first FUSE
- * device by default. If you change the prefix from the default to something
- * else, the user-space FUSE library will need to know about it too.
- */
-#define FUSE_DEVICE_BASENAME "fuse"
-
-/*
- * This is the number of /dev/fuse<n> nodes we will create. <n> goes from
- * 0 to (FUSE_NDEVICES - 1).
- */
-#define FUSE_NDEVICES 16
-
-/*
- * This is the default block size of the virtual storage devices that are
- * implicitly implemented by the FUSE kernel extension. This can be changed
- * on a per-mount basis (there's one such virtual device for each mount).
- */
-#define FUSE_DEFAULT_BLOCKSIZE 4096
-
-/*
- * This is default I/O size used while accessing the virtual storage devices.
- * This can be changed on a per-mount basis.
- */
-#define FUSE_DEFAULT_IOSIZE 4096
-
-#ifdef KERNEL
-
-/*
- * This is the soft upper limit on the number of "request tickets" FUSE's
- * user-kernel IPC layer can have for a given mount. This can be modified
- * through the fuse.* sysctl interface.
- */
-#define FUSE_DEFAULT_MAX_FREE_TICKETS 1024
-
-#define FUSE_DEFAULT_IOV_PERMANENT_BUFSIZE (1L << 19)
-#define FUSE_DEFAULT_IOV_CREDIT 16
-
-#endif
-
-#define FUSE_LINK_MAX UINT32_MAX
-
-#endif /* _FUSE_PARAM_H_ */
Index: sys/fs/fuse/fuse_vfsops.c
===================================================================
--- sys/fs/fuse/fuse_vfsops.c
+++ sys/fs/fuse/fuse_vfsops.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -81,7 +86,6 @@
#include <sys/fcntl.h>
#include "fuse.h"
-#include "fuse_param.h"
#include "fuse_node.h"
#include "fuse_ipc.h"
#include "fuse_internal.h"
@@ -89,13 +93,13 @@
#include <sys/priv.h>
#include <security/mac/mac_framework.h>
-SDT_PROVIDER_DECLARE(fuse);
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , vfsops, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , vfsops, trace, "int", "char*");
/* This will do for privilege types for now */
#ifndef PRIV_VFS_FUSE_ALLOWOTHER
@@ -108,30 +112,28 @@
#define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER
#endif
+static vfs_fhtovp_t fuse_vfsop_fhtovp;
static vfs_mount_t fuse_vfsop_mount;
static vfs_unmount_t fuse_vfsop_unmount;
static vfs_root_t fuse_vfsop_root;
static vfs_statfs_t fuse_vfsop_statfs;
+static vfs_vget_t fuse_vfsop_vget;
struct vfsops fuse_vfsops = {
+ .vfs_fhtovp = fuse_vfsop_fhtovp,
.vfs_mount = fuse_vfsop_mount,
.vfs_unmount = fuse_vfsop_unmount,
.vfs_root = fuse_vfsop_root,
.vfs_statfs = fuse_vfsop_statfs,
+ .vfs_vget = fuse_vfsop_vget,
};
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, init_backgrounded, CTLFLAG_RD,
- SYSCTL_NULL_INT_PTR, 1, "indicate async handshake");
static int fuse_enforce_dev_perms = 0;
SYSCTL_INT(_vfs_fusefs, OID_AUTO, enforce_dev_perms, CTLFLAG_RW,
&fuse_enforce_dev_perms, 0,
"enforce fuse device permissions for secondary mounts");
-static unsigned sync_unmount = 1;
-SYSCTL_UINT(_vfs_fusefs, OID_AUTO, sync_unmount, CTLFLAG_RW,
- &sync_unmount, 0, "specify when to use synchronous unmount");
-
MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer");
static int
@@ -208,11 +210,90 @@
vfs_flagopt(opts, "__" #fnam, &__mntopts, fval); \
} while (0)
-SDT_PROBE_DEFINE1(fuse, , vfsops, mntopts, "uint64_t");
-SDT_PROBE_DEFINE4(fuse, , vfsops, mount_err, "char*", "struct fuse_data*",
+SDT_PROBE_DEFINE1(fusefs, , vfsops, mntopts, "uint64_t");
+SDT_PROBE_DEFINE4(fusefs, , vfsops, mount_err, "char*", "struct fuse_data*",
"struct mount*", "int");
static int
+fuse_vfs_remount(struct mount *mp, struct thread *td, uint64_t mntopts,
+ uint32_t max_read, int daemon_timeout)
+{
+ int err = 0;
+ struct fuse_data *data = fuse_get_mpdata(mp);
+ /* Don't allow these options to be changed */
+ const static unsigned long long cant_update_opts =
+ MNT_USER; /* Mount owner must be the user running the daemon */
+
+ FUSE_LOCK();
+
+ if ((mp->mnt_flag ^ data->mnt_flag) & cant_update_opts) {
+ err = EOPNOTSUPP;
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
+ "Can't change these mount options during remount",
+ data, mp, err);
+ goto out;
+ }
+ if (((data->dataflags ^ mntopts) & FSESS_MNTOPTS_MASK) ||
+ (data->max_read != max_read) ||
+ (data->daemon_timeout != daemon_timeout)) {
+ // TODO: allow changing options where it makes sense
+ err = EOPNOTSUPP;
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
+ "Can't change fuse mount options during remount",
+ data, mp, err);
+ goto out;
+ }
+
+ if (fdata_get_dead(data)) {
+ err = ENOTCONN;
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
+ "device is dead during mount", data, mp, err);
+ goto out;
+ }
+
+ /* Sanity + permission checks */
+ if (!data->daemoncred)
+ panic("fuse daemon found, but identity unknown");
+ if (mntopts & FSESS_DAEMON_CAN_SPY)
+ err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER);
+ if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid)
+ /* are we allowed to do the first mount? */
+ err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER);
+
+out:
+ FUSE_UNLOCK();
+ return err;
+}
+
+static int
+fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags,
+ struct vnode **vpp)
+{
+ struct fuse_fid *ffhp = (struct fuse_fid *)fhp;
+ struct fuse_vnode_data *fvdat;
+ struct vnode *nvp;
+ int error;
+
+ if (!(fuse_get_mpdata(mp)->dataflags & FSESS_EXPORT_SUPPORT))
+ return EOPNOTSUPP;
+
+ error = VFS_VGET(mp, ffhp->nid, LK_EXCLUSIVE, &nvp);
+ if (error) {
+ *vpp = NULLVP;
+ return (error);
+ }
+ fvdat = VTOFUD(nvp);
+ if (fvdat->generation != ffhp->gen) {
+ vput(nvp);
+ *vpp = NULLVP;
+ return (ESTALE);
+ }
+ *vpp = nvp;
+ vnode_create_vobject(*vpp, 0, curthread);
+ return (0);
+}
+
+static int
fuse_vfsop_mount(struct mount *mp)
{
int err;
@@ -238,13 +319,6 @@
__mntopts = 0;
td = curthread;
- if (mp->mnt_flag & MNT_UPDATE)
- return EOPNOTSUPP;
-
- MNT_ILOCK(mp);
- mp->mnt_flag |= MNT_SYNCHRONOUS;
- mp->mnt_data = NULL;
- MNT_IUNLOCK(mp);
/* Get the new options passed to mount */
opts = mp->mnt_optnew;
@@ -255,19 +329,6 @@
if (!vfs_getopts(opts, "fspath", &err))
return err;
- /* `from' contains the device name (eg. /dev/fuse0); REQUIRED */
- fspec = vfs_getopts(opts, "from", &err);
- if (!fspec)
- return err;
-
- /* `fd' contains the filedescriptor for this session; REQUIRED */
- if (vfs_scanopt(opts, "fd", "%d", &fd) != 1)
- return EINVAL;
-
- err = fuse_getdevice(fspec, td, &fdev);
- if (err != 0)
- return err;
-
/*
* With the help of underscored options the mount program
* can inform us from the flags it sets by default
@@ -275,12 +336,7 @@
FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY);
FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN);
FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS);
- FUSE_FLAGOPT(no_attrcache, FSESS_NO_ATTRCACHE);
- FUSE_FLAGOPT(no_readahed, FSESS_NO_READAHEAD);
- FUSE_FLAGOPT(no_datacache, FSESS_NO_DATACACHE);
- FUSE_FLAGOPT(no_namecache, FSESS_NO_NAMECACHE);
- FUSE_FLAGOPT(no_mmap, FSESS_NO_MMAP);
- FUSE_FLAGOPT(brokenio, FSESS_BROKENIO);
+ FUSE_FLAGOPT(intr, FSESS_INTR);
(void)vfs_scanopt(opts, "max_read=", "%u", &max_read);
if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) {
@@ -293,11 +349,29 @@
}
subtype = vfs_getopts(opts, "subtype=", &err);
- SDT_PROBE1(fuse, , vfsops, mntopts, mntopts);
+ SDT_PROBE1(fusefs, , vfsops, mntopts, mntopts);
+ if (mp->mnt_flag & MNT_UPDATE) {
+ return fuse_vfs_remount(mp, td, mntopts, max_read,
+ daemon_timeout);
+ }
+
+ /* `from' contains the device name (eg. /dev/fuse0); REQUIRED */
+ fspec = vfs_getopts(opts, "from", &err);
+ if (!fspec)
+ return err;
+
+ /* `fd' contains the filedescriptor for this session; REQUIRED */
+ if (vfs_scanopt(opts, "fd", "%d", &fd) != 1)
+ return EINVAL;
+
+ err = fuse_getdevice(fspec, td, &fdev);
+ if (err != 0)
+ return err;
+
err = fget(td, fd, &cap_read_rights, &fp);
if (err != 0) {
- SDT_PROBE2(fuse, , vfsops, trace, 1,
+ SDT_PROBE2(fusefs, , vfsops, trace, 1,
"invalid or not opened device");
goto out;
}
@@ -307,16 +381,17 @@
td->td_fpop = fptmp;
fdrop(fp, td);
FUSE_LOCK();
- if (err != 0 || data == NULL || data->mp != NULL) {
+
+ if (err != 0 || data == NULL) {
err = ENXIO;
- SDT_PROBE4(fuse, , vfsops, mount_err,
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
"invalid or not opened device", data, mp, err);
FUSE_UNLOCK();
goto out;
}
if (fdata_get_dead(data)) {
err = ENOTCONN;
- SDT_PROBE4(fuse, , vfsops, mount_err,
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
"device is dead during mount", data, mp, err);
FUSE_UNLOCK();
goto out;
@@ -338,12 +413,17 @@
data->dataflags |= mntopts;
data->max_read = max_read;
data->daemon_timeout = daemon_timeout;
+ data->mnt_flag = mp->mnt_flag & MNT_UPDATEMASK;
FUSE_UNLOCK();
vfs_getnewfsid(mp);
MNT_ILOCK(mp);
mp->mnt_data = data;
- mp->mnt_flag |= MNT_LOCAL;
+ /*
+ * FUSE file systems can be either local or remote, but the kernel
+ * can't tell the difference.
+ */
+ mp->mnt_flag &= ~MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_USES_BCACHE;
MNT_IUNLOCK(mp);
/* We need this here as this slot is used by getnewvnode() */
@@ -354,6 +434,7 @@
}
copystr(fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &len);
bzero(mp->mnt_stat.f_mntfromname + len, MNAMELEN - len);
+ mp->mnt_iosize_max = MAXPHYS;
/* Now handshaking with daemon */
fuse_internal_send_init(data, td);
@@ -366,9 +447,10 @@
* Destroy device only if we acquired reference to
* it
*/
- SDT_PROBE4(fuse, , vfsops, mount_err,
+ SDT_PROBE4(fusefs, , vfsops, mount_err,
"mount failed, destroy device", data, mp, err);
data->mp = NULL;
+ mp->mnt_data = NULL;
fdata_trydestroy(data);
}
FUSE_UNLOCK();
@@ -412,11 +494,13 @@
if (fdata_get_dead(data)) {
goto alreadydead;
}
- fdisp_init(&fdi, 0);
- fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL);
+ if (fsess_isimpl(mp, FUSE_DESTROY)) {
+ fdisp_init(&fdi, 0);
+ fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL);
- err = fdisp_wait_answ(&fdi);
- fdisp_destroy(&fdi);
+ (void)fdisp_wait_answ(&fdi);
+ fdisp_destroy(&fdi);
+ }
fdata_set_dead(data);
@@ -429,7 +513,6 @@
MNT_ILOCK(mp);
mp->mnt_data = NULL;
- mp->mnt_flag &= ~MNT_LOCAL;
MNT_IUNLOCK(mp);
dev_rel(fdev);
@@ -437,7 +520,87 @@
return 0;
}
+SDT_PROBE_DEFINE1(fusefs, , vfsops, invalidate_without_export,
+ "struct mount*");
static int
+fuse_vfsop_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
+{
+ struct fuse_data *data = fuse_get_mpdata(mp);
+ uint64_t nodeid = ino;
+ struct thread *td = curthread;
+ struct fuse_dispatcher fdi;
+ struct fuse_entry_out *feo;
+ struct fuse_vnode_data *fvdat;
+ const char dot[] = ".";
+ off_t filesize;
+ enum vtype vtyp;
+ int error;
+
+ if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) {
+ /*
+ * Unreachable unless you do something stupid, like export a
+ * nullfs mount of a fusefs file system.
+ */
+ SDT_PROBE1(fusefs, , vfsops, invalidate_without_export, mp);
+ return (EOPNOTSUPP);
+ }
+
+ error = fuse_internal_get_cached_vnode(mp, ino, flags, vpp);
+ if (error || *vpp != NULL)
+ return error;
+
+ /* Do a LOOKUP, using nodeid as the parent and "." as filename */
+ fdisp_init(&fdi, sizeof(dot));
+ fdisp_make(&fdi, FUSE_LOOKUP, mp, nodeid, td, td->td_ucred);
+ memcpy(fdi.indata, dot, sizeof(dot));
+ error = fdisp_wait_answ(&fdi);
+
+ if (error)
+ return error;
+
+ feo = (struct fuse_entry_out *)fdi.answ;
+ if (feo->nodeid == 0) {
+ /* zero nodeid means ENOENT */
+ error = ENOENT;
+ goto out;
+ }
+
+ vtyp = IFTOVT(feo->attr.mode);
+ error = fuse_vnode_get(mp, feo, nodeid, NULL, vpp, NULL, vtyp);
+ if (error)
+ goto out;
+ filesize = feo->attr.size;
+
+ /*
+ * If we are looking up a FUSE node represented by an
+ * existing cached vnode, and the true size reported by FUSE_LOOKUP
+ * doesn't match the vnode's cached size, then any cached writes beyond
+ * the file's current size are lost.
+ *
+ * We can get here:
+ * * following attribute cache expiration, or
+ * * due to a bug in the daemon.
+ */
+ fvdat = VTOFUD(*vpp);
+ if (vnode_isreg(*vpp) &&
+ filesize != fvdat->cached_attrs.va_size &&
+ fvdat->flag & FN_SIZECHANGE) {
+ printf("%s: WB cache incoherent on %s!\n", __func__,
+ vnode_mount(*vpp)->mnt_stat.f_mntonname);
+
+ fvdat->flag &= ~FN_SIZECHANGE;
+ }
+
+ fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
+ feo->attr_valid_nsec, NULL);
+ fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec,
+ &fvdat->entry_cache_timeout);
+out:
+ fdisp_destroy(&fdi);
+ return error;
+}
+
+static int
fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp)
{
struct fuse_data *data = fuse_get_mpdata(mp);
@@ -454,13 +617,13 @@
FUSE_LOCK();
MPASS(data->vroot == NULL || data->vroot == *vpp);
if (data->vroot == NULL) {
- SDT_PROBE2(fuse, , vfsops, trace, 1,
+ SDT_PROBE2(fusefs, , vfsops, trace, 1,
"new root vnode");
data->vroot = *vpp;
FUSE_UNLOCK();
vref(*vpp);
} else if (data->vroot != *vpp) {
- SDT_PROBE2(fuse, , vfsops, trace, 1,
+ SDT_PROBE2(fusefs, , vfsops, trace, 1,
"root vnode race");
FUSE_UNLOCK();
VOP_UNLOCK(*vpp, 0);
@@ -523,7 +686,7 @@
sbp->f_files = 0;
sbp->f_ffree = 0;
sbp->f_namemax = 0;
- sbp->f_bsize = FUSE_DEFAULT_BLOCKSIZE;
+ sbp->f_bsize = S_BLKSIZE;
return 0;
}
Index: sys/fs/fuse/fuse_vnops.c
===================================================================
--- sys/fs/fuse/fuse_vnops.c
+++ sys/fs/fuse/fuse_vnops.c
@@ -33,6 +33,11 @@
* Copyright (C) 2005 Csaba Henk.
* All rights reserved.
*
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by BFF Storage Systems, LLC under
+ * sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -102,24 +107,30 @@
#include "fuse_internal.h"
#include "fuse_ipc.h"
#include "fuse_node.h"
-#include "fuse_param.h"
#include "fuse_io.h"
#include <sys/priv.h>
-SDT_PROVIDER_DECLARE(fuse);
+/* Maximum number of hardlinks to a single FUSE file */
+#define FUSE_LINK_MAX UINT32_MAX
+
+SDT_PROVIDER_DECLARE(fusefs);
/*
* Fuse trace probe:
* arg0: verbosity. Higher numbers give more verbose messages
* arg1: Textual message
*/
-SDT_PROBE_DEFINE2(fuse, , vnops, trace, "int", "char*");
+SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*");
/* vnode ops */
static vop_access_t fuse_vnop_access;
+static vop_advlock_t fuse_vnop_advlock;
+static vop_bmap_t fuse_vnop_bmap;
+static vop_close_t fuse_fifo_close;
static vop_close_t fuse_vnop_close;
static vop_create_t fuse_vnop_create;
static vop_deleteextattr_t fuse_vnop_deleteextattr;
+static vop_fdatasync_t fuse_vnop_fdatasync;
static vop_fsync_t fuse_vnop_fsync;
static vop_getattr_t fuse_vnop_getattr;
static vop_getextattr_t fuse_vnop_getextattr;
@@ -144,19 +155,44 @@
static vop_symlink_t fuse_vnop_symlink;
static vop_write_t fuse_vnop_write;
static vop_getpages_t fuse_vnop_getpages;
-static vop_putpages_t fuse_vnop_putpages;
static vop_print_t fuse_vnop_print;
+static vop_vptofh_t fuse_vnop_vptofh;
+struct vop_vector fuse_fifoops = {
+ .vop_default = &fifo_specops,
+ .vop_access = fuse_vnop_access,
+ .vop_close = fuse_fifo_close,
+ .vop_fsync = fuse_vnop_fsync,
+ .vop_getattr = fuse_vnop_getattr,
+ .vop_inactive = fuse_vnop_inactive,
+ .vop_pathconf = fuse_vnop_pathconf,
+ .vop_print = fuse_vnop_print,
+ .vop_read = VOP_PANIC,
+ .vop_reclaim = fuse_vnop_reclaim,
+ .vop_setattr = fuse_vnop_setattr,
+ .vop_write = VOP_PANIC,
+ .vop_vptofh = fuse_vnop_vptofh,
+};
+
struct vop_vector fuse_vnops = {
+ .vop_allocate = VOP_EINVAL,
.vop_default = &default_vnodeops,
.vop_access = fuse_vnop_access,
+ .vop_advlock = fuse_vnop_advlock,
+ .vop_bmap = fuse_vnop_bmap,
.vop_close = fuse_vnop_close,
.vop_create = fuse_vnop_create,
.vop_deleteextattr = fuse_vnop_deleteextattr,
.vop_fsync = fuse_vnop_fsync,
+ .vop_fdatasync = fuse_vnop_fdatasync,
.vop_getattr = fuse_vnop_getattr,
.vop_getextattr = fuse_vnop_getextattr,
.vop_inactive = fuse_vnop_inactive,
+ /*
+ * TODO: implement vop_ioctl after upgrading to protocol 7.16.
+ * FUSE_IOCTL was added in 7.11, but 32-bit compat is broken until
+ * 7.16.
+ */
.vop_link = fuse_vnop_link,
.vop_listextattr = fuse_vnop_listextattr,
.vop_lookup = fuse_vnop_lookup,
@@ -164,6 +200,12 @@
.vop_mknod = fuse_vnop_mknod,
.vop_open = fuse_vnop_open,
.vop_pathconf = fuse_vnop_pathconf,
+ /*
+ * TODO: implement vop_poll after upgrading to protocol 7.21.
+ * FUSE_POLL was added in protocol 7.11, but it's kind of broken until
+ * 7.21, which adds the ability for the client to choose which poll
+ * events it wants, and for a client to deregister a file handle
+ */
.vop_read = fuse_vnop_read,
.vop_readdir = fuse_vnop_readdir,
.vop_readlink = fuse_vnop_readlink,
@@ -177,41 +219,103 @@
.vop_symlink = fuse_vnop_symlink,
.vop_write = fuse_vnop_write,
.vop_getpages = fuse_vnop_getpages,
- .vop_putpages = fuse_vnop_putpages,
.vop_print = fuse_vnop_print,
+ .vop_vptofh = fuse_vnop_vptofh,
};
-static u_long fuse_lookup_cache_hits = 0;
+uma_zone_t fuse_pbuf_zone;
-SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_hits, CTLFLAG_RD,
- &fuse_lookup_cache_hits, 0, "number of positive cache hits in lookup");
+#define fuse_vm_page_lock(m) vm_page_lock((m));
+#define fuse_vm_page_unlock(m) vm_page_unlock((m));
+#define fuse_vm_page_lock_queues() ((void)0)
+#define fuse_vm_page_unlock_queues() ((void)0)
-static u_long fuse_lookup_cache_misses = 0;
+/* Check permission for extattr operations, much like extattr_check_cred */
+static int
+fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred,
+ struct thread *td, accmode_t accmode)
+{
+ struct mount *mp = vnode_mount(vp);
+ struct fuse_data *data = fuse_get_mpdata(mp);
-SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_misses, CTLFLAG_RD,
- &fuse_lookup_cache_misses, 0, "number of cache misses in lookup");
+ /*
+ * Kernel-invoked always succeeds.
+ */
+ if (cred == NOCRED)
+ return (0);
-int fuse_lookup_cache_enable = 1;
+ /*
+ * Do not allow privileged processes in jail to directly manipulate
+ * system attributes.
+ */
+ switch (ns) {
+ case EXTATTR_NAMESPACE_SYSTEM:
+ if (data->dataflags & FSESS_DEFAULT_PERMISSIONS) {
+ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
+ }
+ /* FALLTHROUGH */
+ case EXTATTR_NAMESPACE_USER:
+ return (fuse_internal_access(vp, accmode, td, cred));
+ default:
+ return (EPERM);
+ }
+}
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, lookup_cache_enable, CTLFLAG_RW,
- &fuse_lookup_cache_enable, 0, "if non-zero, enable lookup cache");
+/* Get a filehandle for a directory */
+static int
+fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp,
+ struct ucred *cred, pid_t pid)
+{
+ if (fuse_filehandle_get(vp, FREAD, fufhp, cred, pid) == 0)
+ return 0;
+ return fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid);
+}
-/*
- * XXX: This feature is highly experimental and can bring to instabilities,
- * needs revisiting before to be enabled by default.
- */
-static int fuse_reclaim_revoked = 0;
+/* Send FUSE_FLUSH for this vnode */
+static int
+fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag)
+{
+ struct fuse_flush_in *ffi;
+ struct fuse_filehandle *fufh;
+ struct fuse_dispatcher fdi;
+ struct thread *td = curthread;
+ struct mount *mp = vnode_mount(vp);
+ int err;
-SYSCTL_INT(_vfs_fusefs, OID_AUTO, reclaim_revoked, CTLFLAG_RW,
- &fuse_reclaim_revoked, 0, "");
+ if (!fsess_isimpl(vnode_mount(vp), FUSE_FLUSH))
+ return 0;
-uma_zone_t fuse_pbuf_zone;
+ err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
+ if (err)
+ return err;
-#define fuse_vm_page_lock(m) vm_page_lock((m));
-#define fuse_vm_page_unlock(m) vm_page_unlock((m));
-#define fuse_vm_page_lock_queues() ((void)0)
-#define fuse_vm_page_unlock_queues() ((void)0)
+ fdisp_init(&fdi, sizeof(*ffi));
+ fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred);
+ ffi = fdi.indata;
+ ffi->fh = fufh->fh_id;
+ /*
+ * If the file has a POSIX lock then we're supposed to set lock_owner.
+ * If not, then lock_owner is undefined. So we may as well always set
+ * it.
+ */
+ ffi->lock_owner = td->td_proc->p_pid;
+ err = fdisp_wait_answ(&fdi);
+ if (err == ENOSYS) {
+ fsess_set_notimpl(mp, FUSE_FLUSH);
+ err = 0;
+ }
+ fdisp_destroy(&fdi);
+ return err;
+}
+
+/* Close wrapper for fifos. */
+static int
+fuse_fifo_close(struct vop_close_args *ap)
+{
+ return (fifo_specops.vop_close(ap));
+}
+
/*
struct vnop_access_args {
struct vnode *a_vp;
@@ -231,7 +335,6 @@
int accmode = ap->a_accmode;
struct ucred *cred = ap->a_cred;
- struct fuse_access_param facp;
struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
int err;
@@ -254,15 +357,192 @@
if (vnode_islnk(vp)) {
return 0;
}
- bzero(&facp, sizeof(facp));
- err = fuse_internal_access(vp, accmode, &facp, ap->a_td, ap->a_cred);
+ err = fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred);
return err;
}
/*
- struct vnop_close_args {
+ * struct vop_advlock_args {
+ * struct vop_generic_args a_gen;
+ * struct vnode *a_vp;
+ * void *a_id;
+ * int a_op;
+ * struct flock *a_fl;
+ * int a_flags;
+ * }
+ */
+static int
+fuse_vnop_advlock(struct vop_advlock_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct flock *fl = ap->a_fl;
+ struct thread *td = curthread;
+ struct ucred *cred = td->td_ucred;
+ pid_t pid = td->td_proc->p_pid;
+ struct fuse_filehandle *fufh;
+ struct fuse_dispatcher fdi;
+ struct fuse_lk_in *fli;
+ struct fuse_lk_out *flo;
+ enum fuse_opcode op;
+ int dataflags, err;
+ int flags = ap->a_flags;
+
+ dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
+
+ if (fuse_isdeadfs(vp)) {
+ return ENXIO;
+ }
+
+ if (!(dataflags & FSESS_POSIX_LOCKS))
+ return vop_stdadvlock(ap);
+ /* FUSE doesn't properly support flock until protocol 7.17 */
+ if (flags & F_FLOCK)
+ return vop_stdadvlock(ap);
+
+ err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid);
+ if (err)
+ return err;
+
+ fdisp_init(&fdi, sizeof(*fli));
+
+ switch(ap->a_op) {
+ case F_GETLK:
+ op = FUSE_GETLK;
+ break;
+ case F_SETLK:
+ op = FUSE_SETLK;
+ break;
+ case F_SETLKW:
+ op = FUSE_SETLKW;
+ break;
+ default:
+ return EINVAL;
+ }
+
+ fdisp_make_vp(&fdi, op, vp, td, cred);
+ fli = fdi.indata;
+ fli->fh = fufh->fh_id;
+ fli->owner = fl->l_pid;
+ fli->lk.start = fl->l_start;
+ if (fl->l_len != 0)
+ fli->lk.end = fl->l_start + fl->l_len - 1;
+ else
+ fli->lk.end = INT64_MAX;
+ fli->lk.type = fl->l_type;
+ fli->lk.pid = fl->l_pid;
+
+ err = fdisp_wait_answ(&fdi);
+ fdisp_destroy(&fdi);
+
+ if (err == 0 && op == FUSE_GETLK) {
+ flo = fdi.answ;
+ fl->l_type = flo->lk.type;
+ fl->l_pid = flo->lk.pid;
+ if (flo->lk.type != F_UNLCK) {
+ fl->l_start = flo->lk.start;
+ if (flo->lk.end == INT64_MAX)
+ fl->l_len = 0;
+ else
+ fl->l_len = flo->lk.end - flo->lk.start + 1;
+ fl->l_start = flo->lk.start;
+ }
+ }
+
+ return err;
+}
+
+/* {
struct vnode *a_vp;
+ daddr_t a_bn;
+ struct bufobj **a_bop;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+} */
+static int
+fuse_vnop_bmap(struct vop_bmap_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct bufobj **bo = ap->a_bop;
+ struct thread *td = curthread;
+ struct mount *mp;
+ struct fuse_dispatcher fdi;
+ struct fuse_bmap_in *fbi;
+ struct fuse_bmap_out *fbo;
+ struct fuse_data *data;
+ uint64_t biosize;
+ off_t filesize;
+ daddr_t lbn = ap->a_bn;
+ daddr_t *pbn = ap->a_bnp;
+ int *runp = ap->a_runp;
+ int *runb = ap->a_runb;
+ int error = 0;
+ int maxrun;
+
+ if (fuse_isdeadfs(vp)) {
+ return ENXIO;
+ }
+
+ mp = vnode_mount(vp);
+ data = fuse_get_mpdata(mp);
+ biosize = fuse_iosize(vp);
+ maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1,
+ data->max_readahead_blocks);
+
+ if (bo != NULL)
+ *bo = &vp->v_bufobj;
+
+ /*
+ * The FUSE_BMAP operation does not include the runp and runb
+ * variables, so we must guess. Report nonzero contiguous runs so
+ * cluster_read will combine adjacent reads. It's worthwhile to reduce
+ * upcalls even if we don't know the true physical layout of the file.
+ *
+ * FUSE file systems may opt out of read clustering in two ways:
+ * * mounting with -onoclusterr
+ * * Setting max_readahead <= maxbcachebuf during FUSE_INIT
+ */
+ if (runb != NULL)
+ *runb = MIN(lbn, maxrun);
+ if (runp != NULL) {
+ error = fuse_vnode_size(vp, &filesize, td->td_ucred, td);
+ if (error == 0)
+ *runp = MIN(MAX(0, filesize / biosize - lbn - 1),
+ maxrun);
+ else
+ *runp = 0;
+ }
+
+ if (fsess_isimpl(mp, FUSE_BMAP)) {
+ fdisp_init(&fdi, sizeof(*fbi));
+ fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred);
+ fbi = fdi.indata;
+ fbi->block = lbn;
+ fbi->blocksize = biosize;
+ error = fdisp_wait_answ(&fdi);
+ if (error == ENOSYS) {
+ fdisp_destroy(&fdi);
+ fsess_set_notimpl(mp, FUSE_BMAP);
+ error = 0;
+ } else {
+ fbo = fdi.answ;
+ if (error == 0 && pbn != NULL)
+ *pbn = fbo->block;
+ fdisp_destroy(&fdi);
+ return error;
+ }
+ }
+
+ /* If the daemon doesn't support BMAP, make up a sensible default */
+ if (pbn != NULL)
+ *pbn = lbn * btodb(biosize);
+ return (error);
+}
+
+/*
+ struct vop_close_args {
+ struct vnode *a_vp;
int a_fflag;
struct ucred *a_cred;
struct thread *a_td;
@@ -274,39 +554,48 @@
struct vnode *vp = ap->a_vp;
struct ucred *cred = ap->a_cred;
int fflag = ap->a_fflag;
- fufh_type_t fufh_type;
+ struct thread *td = ap->a_td;
+ pid_t pid = td->td_proc->p_pid;
+ int err = 0;
- if (fuse_isdeadfs(vp)) {
+ if (fuse_isdeadfs(vp))
return 0;
- }
- if (vnode_isdir(vp)) {
- if (fuse_filehandle_valid(vp, FUFH_RDONLY)) {
- fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred);
- }
+ if (vnode_isdir(vp))
return 0;
- }
- if (fflag & IO_NDELAY) {
+ if (fflag & IO_NDELAY)
return 0;
- }
- fufh_type = fuse_filehandle_xlate_from_fflags(fflag);
- if (!fuse_filehandle_valid(vp, fufh_type)) {
- int i;
-
- for (i = 0; i < FUFH_MAXTYPE; i++)
- if (fuse_filehandle_valid(vp, i))
- break;
- if (i == FUFH_MAXTYPE)
- panic("FUSE: fufh type %d found to be invalid in close"
- " (fflag=0x%x)\n",
- fufh_type, fflag);
- }
+ err = fuse_flush(vp, cred, pid, fflag);
+ /* TODO: close the file handle, if we're sure it's no longer used */
if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
- fuse_vnode_savesize(vp, cred);
+ fuse_vnode_savesize(vp, cred, td->td_proc->p_pid);
}
- return 0;
+ return err;
}
+static void
+fdisp_make_mknod_for_fallback(
+ struct fuse_dispatcher *fdip,
+ struct componentname *cnp,
+ struct vnode *dvp,
+ uint64_t parentnid,
+ struct thread *td,
+ struct ucred *cred,
+ mode_t mode,
+ enum fuse_opcode *op)
+{
+ struct fuse_mknod_in *fmni;
+
+ fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1);
+ *op = FUSE_MKNOD;
+ fdisp_make(fdip, *op, vnode_mount(dvp), parentnid, td, cred);
+ fmni = fdip->indata;
+ fmni->mode = mode;
+ fmni->rdev = 0;
+ memcpy((char *)fdip->indata + sizeof(*fmni), cnp->cn_nameptr,
+ cnp->cn_namelen);
+ ((char *)fdip->indata)[sizeof(*fmni) + cnp->cn_namelen] = '\0';
+}
/*
struct vnop_create_args {
struct vnode *a_dvp;
@@ -325,107 +614,169 @@
struct thread *td = cnp->cn_thread;
struct ucred *cred = cnp->cn_cred;
- struct fuse_open_in *foi;
+ struct fuse_data *data;
+ struct fuse_create_in *fci;
struct fuse_entry_out *feo;
- struct fuse_dispatcher fdi;
+ struct fuse_open_out *foo;
+ struct fuse_dispatcher fdi, fdi2;
struct fuse_dispatcher *fdip = &fdi;
+ struct fuse_dispatcher *fdip2 = NULL;
int err;
struct mount *mp = vnode_mount(dvp);
+ data = fuse_get_mpdata(mp);
uint64_t parentnid = VTOFUD(dvp)->nid;
mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode);
- uint64_t x_fh_id;
- uint32_t x_open_flags;
+ enum fuse_opcode op;
+ int flags;
- if (fuse_isdeadfs(dvp)) {
+ if (fuse_isdeadfs(dvp))
return ENXIO;
- }
+
+ /* FUSE expects sockets to be created with FUSE_MKNOD */
+ if (vap->va_type == VSOCK)
+ return fuse_internal_mknod(dvp, vpp, cnp, vap);
+
+ /*
+ * VOP_CREATE doesn't tell us the open(2) flags, so we guess. Only a
+ * writable mode makes sense, and we might as well include readability
+ * too.
+ */
+ flags = O_RDWR;
+
bzero(&fdi, sizeof(fdi));
- /* XXX: Will we ever want devices ? */
- if ((vap->va_type != VREG)) {
- printf("fuse_vnop_create: unsupported va_type %d\n",
- vap->va_type);
+ if (vap->va_type != VREG)
return (EINVAL);
- }
- fdisp_init(fdip, sizeof(*foi) + cnp->cn_namelen + 1);
- if (!fsess_isimpl(mp, FUSE_CREATE)) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
- "eh, daemon doesn't implement create?");
- return (EINVAL);
- }
- fdisp_make(fdip, FUSE_CREATE, vnode_mount(dvp), parentnid, td, cred);
+ if (!fsess_isimpl(mp, FUSE_CREATE) || vap->va_type == VSOCK) {
+ /* Fallback to FUSE_MKNOD/FUSE_OPEN */
+ fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td,
+ cred, mode, &op);
+ } else {
+ /* Use FUSE_CREATE */
+ size_t insize;
- foi = fdip->indata;
- foi->mode = mode;
- foi->flags = O_CREAT | O_RDWR;
+ op = FUSE_CREATE;
+ fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1);
+ fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred);
+ fci = fdip->indata;
+ fci->mode = mode;
+ fci->flags = O_CREAT | flags;
+ if (fuse_libabi_geq(data, 7, 12)) {
+ insize = sizeof(*fci);
+ fci->umask = td->td_proc->p_fd->fd_cmask;
+ } else {
+ insize = sizeof(struct fuse_open_in);
+ }
- memcpy((char *)fdip->indata + sizeof(*foi), cnp->cn_nameptr,
- cnp->cn_namelen);
- ((char *)fdip->indata)[sizeof(*foi) + cnp->cn_namelen] = '\0';
+ memcpy((char *)fdip->indata + insize, cnp->cn_nameptr,
+ cnp->cn_namelen);
+ ((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0';
+ }
err = fdisp_wait_answ(fdip);
if (err) {
- if (err == ENOSYS)
+ if (err == ENOSYS && op == FUSE_CREATE) {
fsess_set_notimpl(mp, FUSE_CREATE);
- goto out;
+ fdisp_destroy(fdip);
+ fdisp_make_mknod_for_fallback(fdip, cnp, dvp,
+ parentnid, td, cred, mode, &op);
+ err = fdisp_wait_answ(fdip);
+ }
+ if (err)
+ goto out;
}
feo = fdip->answ;
- if ((err = fuse_internal_checkentry(feo, VREG))) {
+ if ((err = fuse_internal_checkentry(feo, vap->va_type))) {
goto out;
}
- err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, VREG);
+
+ if (op == FUSE_CREATE) {
+ foo = (struct fuse_open_out*)(feo + 1);
+ } else {
+ /* Issue a separate FUSE_OPEN */
+ struct fuse_open_in *foi;
+
+ fdip2 = &fdi2;
+ fdisp_init(fdip2, sizeof(*foi));
+ fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td,
+ cred);
+ foi = fdip2->indata;
+ foi->flags = flags;
+ err = fdisp_wait_answ(fdip2);
+ if (err)
+ goto out;
+ foo = fdip2->answ;
+ }
+ err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type);
if (err) {
struct fuse_release_in *fri;
uint64_t nodeid = feo->nodeid;
- uint64_t fh_id = ((struct fuse_open_out *)(feo + 1))->fh;
+ uint64_t fh_id = foo->fh;
fdisp_init(fdip, sizeof(*fri));
fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred);
fri = fdip->indata;
fri->fh = fh_id;
- fri->flags = OFLAGS(mode);
+ fri->flags = flags;
fuse_insert_callback(fdip->tick, fuse_internal_forget_callback);
- fuse_insert_message(fdip->tick);
- return err;
+ fuse_insert_message(fdip->tick, false);
+ goto out;
}
ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create");
+ fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
+ feo->attr_valid_nsec, NULL);
- fdip->answ = feo + 1;
-
- x_fh_id = ((struct fuse_open_out *)(feo + 1))->fh;
- x_open_flags = ((struct fuse_open_out *)(feo + 1))->open_flags;
- fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, x_fh_id);
- fuse_vnode_open(*vpp, x_open_flags, td);
+ fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo);
+ fuse_vnode_open(*vpp, foo->open_flags, td);
+ /*
+ * Purge the parent's attribute cache because the daemon should've
+ * updated its mtime and ctime
+ */
+ fuse_vnode_clear_attr_cache(dvp);
cache_purge_negative(dvp);
out:
+ if (fdip2)
+ fdisp_destroy(fdip2);
fdisp_destroy(fdip);
return err;
}
/*
- * Our vnop_fsync roughly corresponds to the FUSE_FSYNC method. The Linux
- * version of FUSE also has a FUSE_FLUSH method.
- *
- * On Linux, fsync() synchronizes a file's complete in-core state with that
- * on disk. The call is not supposed to return until the system has completed
- * that action or until an error is detected.
- *
- * Linux also has an fdatasync() call that is similar to fsync() but is not
- * required to update the metadata such as access time and modification time.
- */
+ struct vnop_fdatasync_args {
+ struct vop_generic_args a_gen;
+ struct vnode * a_vp;
+ struct thread * a_td;
+ };
+*/
+static int
+fuse_vnop_fdatasync(struct vop_fdatasync_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct thread *td = ap->a_td;
+ int waitfor = MNT_WAIT;
+ int err = 0;
+
+ if (fuse_isdeadfs(vp)) {
+ return 0;
+ }
+ if ((err = vop_stdfdatasync_buf(ap)))
+ return err;
+
+ return fuse_internal_fsync(vp, td, waitfor, true);
+}
+
/*
struct vnop_fsync_args {
- struct vnodeop_desc *a_desc;
+ struct vop_generic_args a_gen;
struct vnode * a_vp;
- struct ucred * a_cred;
int a_waitfor;
struct thread * a_td;
};
@@ -435,31 +786,16 @@
{
struct vnode *vp = ap->a_vp;
struct thread *td = ap->a_td;
+ int waitfor = ap->a_waitfor;
+ int err = 0;
- struct fuse_filehandle *fufh;
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
-
- int type, err = 0;
-
if (fuse_isdeadfs(vp)) {
return 0;
}
if ((err = vop_stdfsync(ap)))
return err;
- if (!fsess_isimpl(vnode_mount(vp),
- (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) {
- goto out;
- }
- for (type = 0; type < FUFH_MAXTYPE; type++) {
- fufh = &(fvdat->fufh[type]);
- if (FUFH_IS_VALID(fufh)) {
- fuse_internal_fsync(vp, td, NULL, fufh);
- }
- }
-
-out:
- return 0;
+ return fuse_internal_fsync(vp, td, waitfor, false);
}
/*
@@ -477,12 +813,9 @@
struct vattr *vap = ap->a_vap;
struct ucred *cred = ap->a_cred;
struct thread *td = curthread;
- struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_attr_out *fao;
int err = 0;
int dataflags;
- struct fuse_dispatcher fdi;
dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
@@ -497,48 +830,14 @@
goto fake;
}
}
- fdisp_init(&fdi, 0);
- if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) {
- if ((err == ENOTCONN) && vnode_isvroot(vp)) {
- /* see comment in fuse_vfsop_statfs() */
- fdisp_destroy(&fdi);
- goto fake;
- }
- if (err == ENOENT) {
- fuse_internal_vnode_disappear(vp);
- }
- goto out;
+ err = fuse_internal_getattr(vp, vap, cred, td);
+ if (err == ENOTCONN && vnode_isvroot(vp)) {
+ /* see comment in fuse_vfsop_statfs() */
+ goto fake;
+ } else {
+ return err;
}
- fao = (struct fuse_attr_out *)fdi.answ;
- fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
- fao->attr_valid_nsec, vap);
- if (vap->va_type != vnode_vtype(vp)) {
- fuse_internal_vnode_disappear(vp);
- err = ENOENT;
- goto out;
- }
- if ((fvdat->flag & FN_SIZECHANGE) != 0)
- vap->va_size = fvdat->filesize;
-
- if (vnode_isreg(vp) && (fvdat->flag & FN_SIZECHANGE) == 0) {
- /*
- * This is for those cases when the file size changed without us
- * knowing, and we want to catch up.
- */
- off_t new_filesize = ((struct fuse_attr_out *)
- fdi.answ)->attr.size;
-
- if (fvdat->filesize != new_filesize) {
- fuse_vnode_setsize(vp, new_filesize);
- fvdat->flag &= ~FN_SIZECHANGE;
- }
- }
-
-out:
- fdisp_destroy(&fdi);
- return err;
-
fake:
bzero(vap, sizeof(*vap));
vap->va_type = vnode_vtype(vp);
@@ -559,31 +858,27 @@
struct thread *td = ap->a_td;
struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_filehandle *fufh = NULL;
+ struct fuse_filehandle *fufh, *fufh_tmp;
- int type, need_flush = 1;
+ int need_flush = 1;
- for (type = 0; type < FUFH_MAXTYPE; type++) {
- fufh = &(fvdat->fufh[type]);
- if (FUFH_IS_VALID(fufh)) {
- if (need_flush && vp->v_type == VREG) {
- if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
- fuse_vnode_savesize(vp, NULL);
- }
- if (fuse_data_cache_invalidate ||
- (fvdat->flag & FN_REVOKED) != 0)
- fuse_io_invalbuf(vp, td);
- else
- fuse_io_flushbuf(vp, MNT_WAIT, td);
- need_flush = 0;
+ LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) {
+ if (need_flush && vp->v_type == VREG) {
+ if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
+ fuse_vnode_savesize(vp, NULL, 0);
}
- fuse_filehandle_close(vp, type, td, NULL);
+ if ((fvdat->flag & FN_REVOKED) != 0)
+ fuse_io_invalbuf(vp, td);
+ else
+ fuse_io_flushbuf(vp, MNT_WAIT, td);
+ need_flush = 0;
}
+ fuse_filehandle_close(vp, fufh, td, NULL);
}
- if ((fvdat->flag & FN_REVOKED) != 0 && fuse_reclaim_revoked) {
+ if ((fvdat->flag & FN_REVOKED) != 0)
vrecycle(vp);
- }
+
return 0;
}
@@ -635,11 +930,39 @@
feo = fdi.answ;
err = fuse_internal_checkentry(feo, vnode_vtype(vp));
+ if (!err) {
+ /*
+ * Purge the parent's attribute cache because the daemon
+ * should've updated its mtime and ctime
+ */
+ fuse_vnode_clear_attr_cache(tdvp);
+ fuse_internal_cache_attrs(vp, &feo->attr, feo->attr_valid,
+ feo->attr_valid_nsec, NULL);
+ }
out:
fdisp_destroy(&fdi);
return err;
}
+struct fuse_lookup_alloc_arg {
+ struct fuse_entry_out *feo;
+ struct componentname *cnp;
+ uint64_t nid;
+ enum vtype vtyp;
+};
+
+/* Callback for vn_get_ino */
+static int
+fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+ struct fuse_lookup_alloc_arg *flaa = arg;
+
+ return fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp,
+ flaa->vtyp);
+}
+
+SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup,
+ "int", "struct timespec*", "struct timespec*");
/*
struct vnop_lookup_args {
struct vnodeop_desc *a_desc;
@@ -668,268 +991,146 @@
struct vnode *vp = NULL;
struct fuse_dispatcher fdi;
- enum fuse_opcode op;
+ bool did_lookup = false;
+ struct fuse_entry_out *feo = NULL;
+ enum vtype vtyp; /* vnode type of target */
+ off_t filesize; /* filesize of target */
uint64_t nid;
- struct fuse_access_param facp;
if (fuse_isdeadfs(dvp)) {
*vpp = NULL;
return ENXIO;
}
- if (!vnode_isdir(dvp)) {
+ if (!vnode_isdir(dvp))
return ENOTDIR;
- }
- if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) {
+
+ if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP))
return EROFS;
- }
- /*
- * We do access check prior to doing anything else only in the case
- * when we are at fs root (we'd like to say, "we are at the first
- * component", but that's not exactly the same... nevermind).
- * See further comments at further access checks.
- */
- bzero(&facp, sizeof(facp));
- if (vnode_isvroot(dvp)) { /* early permission check hack */
- if ((err = fuse_internal_access(dvp, VEXEC, &facp, td, cred))) {
- return err;
- }
- }
+ if ((err = fuse_internal_access(dvp, VEXEC, td, cred)))
+ return err;
+
if (flags & ISDOTDOT) {
+ KASSERT(VTOFUD(dvp)->flag & FN_PARENT_NID,
+ ("Looking up .. is TODO"));
nid = VTOFUD(dvp)->parent_nid;
- if (nid == 0) {
+ if (nid == 0)
return ENOENT;
- }
- fdisp_init(&fdi, 0);
- op = FUSE_GETATTR;
- goto calldaemon;
+ /* .. is obviously a directory */
+ vtyp = VDIR;
+ filesize = 0;
} else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') {
nid = VTOI(dvp);
- fdisp_init(&fdi, 0);
- op = FUSE_GETATTR;
- goto calldaemon;
- } else if (fuse_lookup_cache_enable) {
- err = cache_lookup(dvp, vpp, cnp, NULL, NULL);
- switch (err) {
+ /* . is obviously a directory */
+ vtyp = VDIR;
+ filesize = 0;
+ } else {
+ struct timespec now, timeout;
+ err = cache_lookup(dvp, vpp, cnp, &timeout, NULL);
+ getnanouptime(&now);
+ SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now);
+ switch (err) {
case -1: /* positive match */
- atomic_add_acq_long(&fuse_lookup_cache_hits, 1);
+ if (timespeccmp(&timeout, &now, >)) {
+ counter_u64_add(fuse_lookup_cache_hits, 1);
+ } else {
+ /* Cache timeout */
+ counter_u64_add(fuse_lookup_cache_misses, 1);
+ bintime_clear(
+ &VTOFUD(*vpp)->entry_cache_timeout);
+ cache_purge(*vpp);
+ if (dvp != *vpp)
+ vput(*vpp);
+ else
+ vrele(*vpp);
+ *vpp = NULL;
+ break;
+ }
return 0;
case 0: /* no match in cache */
- atomic_add_acq_long(&fuse_lookup_cache_misses, 1);
+ counter_u64_add(fuse_lookup_cache_misses, 1);
break;
case ENOENT: /* negative match */
+ getnanouptime(&now);
+ if (timespeccmp(&timeout, &now, <=)) {
+ /* Cache timeout */
+ cache_purge_negative(dvp);
+ break;
+ }
/* fall through */
default:
return err;
}
- }
- nid = VTOI(dvp);
- fdisp_init(&fdi, cnp->cn_namelen + 1);
- op = FUSE_LOOKUP;
-calldaemon:
- fdisp_make(&fdi, op, mp, nid, td, cred);
+ nid = VTOI(dvp);
+ fdisp_init(&fdi, cnp->cn_namelen + 1);
+ fdisp_make(&fdi, FUSE_LOOKUP, mp, nid, td, cred);
- if (op == FUSE_LOOKUP) {
memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen);
((char *)fdi.indata)[cnp->cn_namelen] = '\0';
- }
- lookup_err = fdisp_wait_answ(&fdi);
+ lookup_err = fdisp_wait_answ(&fdi);
+ did_lookup = true;
- if ((op == FUSE_LOOKUP) && !lookup_err) { /* lookup call succeeded */
- nid = ((struct fuse_entry_out *)fdi.answ)->nodeid;
- if (!nid) {
- /*
- * zero nodeid is the same as "not found",
- * but it's also cacheable (which we keep
- * keep on doing not as of writing this)
- */
- lookup_err = ENOENT;
- } else if (nid == FUSE_ROOT_ID) {
- lookup_err = EINVAL;
+ if (!lookup_err) {
+ /* lookup call succeeded */
+ feo = (struct fuse_entry_out *)fdi.answ;
+ nid = feo->nodeid;
+ if (nid == 0) {
+ /* zero nodeid means ENOENT and cache it */
+ struct timespec timeout;
+
+ fdi.answ_stat = ENOENT;
+ lookup_err = ENOENT;
+ if (cnp->cn_flags & MAKEENTRY) {
+ fuse_validity_2_timespec(feo, &timeout);
+ cache_enter_time(dvp, *vpp, cnp,
+ &timeout, NULL);
+ }
+ } else if (nid == FUSE_ROOT_ID) {
+ lookup_err = EINVAL;
+ }
+ vtyp = IFTOVT(feo->attr.mode);
+ filesize = feo->attr.size;
}
+ if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) {
+ fdisp_destroy(&fdi);
+ return lookup_err;
+ }
}
- if (lookup_err &&
- (!fdi.answ_stat || lookup_err != ENOENT || op != FUSE_LOOKUP)) {
- fdisp_destroy(&fdi);
- return lookup_err;
- }
/* lookup_err, if non-zero, must be ENOENT at this point */
if (lookup_err) {
+ /* Entry not found */
+ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) {
+ err = fuse_internal_access(dvp, VWRITE, td, cred);
+ if (!err) {
+ /*
+ * Set the SAVENAME flag to hold onto the
+ * pathname for use later in VOP_CREATE or
+ * VOP_RENAME.
+ */
+ cnp->cn_flags |= SAVENAME;
- if ((nameiop == CREATE || nameiop == RENAME) && islastcn
- /* && directory dvp has not been removed */ ) {
-
- if (vfs_isrdonly(mp)) {
- err = EROFS;
- goto out;
+ err = EJUSTRETURN;
}
-#if 0 /* THINK_ABOUT_THIS */
- if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) {
- goto out;
- }
-#endif
-
- /*
- * Possibly record the position of a slot in the
- * directory large enough for the new component name.
- * This can be recorded in the vnode private data for
- * dvp. Set the SAVENAME flag to hold onto the
- * pathname for use later in VOP_CREATE or VOP_RENAME.
- */
- cnp->cn_flags |= SAVENAME;
-
- err = EJUSTRETURN;
- goto out;
- }
- /* Consider inserting name into cache. */
-
- /*
- * No we can't use negative caching, as the fs
- * changes are out of our control.
- * False positives' falseness turns out just as things
- * go by, but false negatives' falseness doesn't.
- * (and aiding the caching mechanism with extra control
- * mechanisms comes quite close to beating the whole purpose
- * caching...)
- */
-#if 0
- if ((cnp->cn_flags & MAKEENTRY) != 0) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
- "inserting NULL into cache");
- cache_enter(dvp, NULL, cnp);
- }
-#endif
- err = ENOENT;
- goto out;
-
- } else {
-
- /* !lookup_err */
-
- struct fuse_entry_out *feo = NULL;
- struct fuse_attr *fattr = NULL;
-
- if (op == FUSE_GETATTR) {
- fattr = &((struct fuse_attr_out *)fdi.answ)->attr;
} else {
- feo = (struct fuse_entry_out *)fdi.answ;
- fattr = &(feo->attr);
+ err = ENOENT;
}
-
- /*
- * If deleting, and at end of pathname, return parameters
- * which can be used to remove file. If the wantparent flag
- * isn't set, we return only the directory, otherwise we go on
- * and lock the inode, being careful with ".".
- */
- if (nameiop == DELETE && islastcn) {
- /*
- * Check for write access on directory.
- */
- facp.xuid = fattr->uid;
- facp.facc_flags |= FACCESS_STICKY;
- err = fuse_internal_access(dvp, VWRITE, &facp, td, cred);
- facp.facc_flags &= ~FACCESS_XQUERIES;
-
- if (err) {
- goto out;
- }
- if (nid == VTOI(dvp)) {
- vref(dvp);
- *vpp = dvp;
- } else {
- err = fuse_vnode_get(dvp->v_mount, feo, nid,
- dvp, &vp, cnp, IFTOVT(fattr->mode));
- if (err)
- goto out;
- *vpp = vp;
- }
-
- /*
- * Save the name for use in VOP_RMDIR and VOP_REMOVE
- * later.
- */
- cnp->cn_flags |= SAVENAME;
- goto out;
-
- }
- /*
- * If rewriting (RENAME), return the inode and the
- * information required to rewrite the present directory
- * Must get inode of directory entry to verify it's a
- * regular file, or empty directory.
- */
- if (nameiop == RENAME && wantparent && islastcn) {
-
-#if 0 /* THINK_ABOUT_THIS */
- if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) {
- goto out;
- }
-#endif
-
- /*
- * Check for "."
- */
- if (nid == VTOI(dvp)) {
- err = EISDIR;
- goto out;
- }
- err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp,
- &vp, cnp, IFTOVT(fattr->mode));
- if (err) {
- goto out;
- }
- *vpp = vp;
- /*
- * Save the name for use in VOP_RENAME later.
- */
- cnp->cn_flags |= SAVENAME;
-
- goto out;
- }
+ } else {
+ /* Entry was found */
if (flags & ISDOTDOT) {
- struct mount *mp;
- int ltype;
+ struct fuse_lookup_alloc_arg flaa;
- /*
- * Expanded copy of vn_vget_ino() so that
- * fuse_vnode_get() can be used.
- */
- mp = dvp->v_mount;
- ltype = VOP_ISLOCKED(dvp);
- err = vfs_busy(mp, MBF_NOWAIT);
- if (err != 0) {
- vfs_ref(mp);
- VOP_UNLOCK(dvp, 0);
- err = vfs_busy(mp, 0);
- vn_lock(dvp, ltype | LK_RETRY);
- vfs_rel(mp);
- if (err)
- goto out;
- if ((dvp->v_iflag & VI_DOOMED) != 0) {
- err = ENOENT;
- vfs_unbusy(mp);
- goto out;
- }
- }
- VOP_UNLOCK(dvp, 0);
- err = fuse_vnode_get(vnode_mount(dvp), feo, nid, NULL,
- &vp, cnp, IFTOVT(fattr->mode));
- vfs_unbusy(mp);
- vn_lock(dvp, ltype | LK_RETRY);
- if ((dvp->v_iflag & VI_DOOMED) != 0) {
- if (err == 0)
- vput(vp);
- err = ENOENT;
- }
- if (err)
- goto out;
+ flaa.nid = nid;
+ flaa.feo = feo;
+ flaa.cnp = cnp;
+ flaa.vtyp = vtyp;
+ err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0,
+ &vp);
*vpp = vp;
} else if (nid == VTOI(dvp)) {
vref(dvp);
@@ -938,25 +1139,26 @@
struct fuse_vnode_data *fvdat;
err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp,
- &vp, cnp, IFTOVT(fattr->mode));
- if (err) {
+ &vp, cnp, vtyp);
+ if (err)
goto out;
- }
- fuse_vnode_setparent(vp, dvp);
+ *vpp = vp;
/*
* In the case where we are looking up a FUSE node
* represented by an existing cached vnode, and the
* true size reported by FUSE_LOOKUP doesn't match
- * the vnode's cached size, fix the vnode cache to
- * match the real object size.
+ * the vnode's cached size, then any cached writes
+ * beyond the file's current size are lost.
*
- * This can occur via FUSE distributed filesystems,
- * irregular files, etc.
+ * We can get here:
+ * * following attribute cache expiration, or
+ * * due to a bug in the daemon.
*/
fvdat = VTOFUD(vp);
if (vnode_isreg(vp) &&
- fattr->size != fvdat->filesize) {
+ filesize != fvdat->cached_attrs.va_size &&
+ fvdat->flag & FN_SIZECHANGE) {
/*
* The FN_SIZECHANGE flag reflects a dirty
* append. If userspace lets us know our cache
@@ -966,131 +1168,64 @@
*
* XXX: Maybe disable WB caching on this mount.
*/
- if (fvdat->flag & FN_SIZECHANGE)
- printf("%s: WB cache incoherent on "
- "%s!\n", __func__,
- vnode_mount(vp)->mnt_stat.f_mntonname);
+ printf("%s: WB cache incoherent on %s!\n",
+ __func__,
+ vnode_mount(vp)->mnt_stat.f_mntonname);
- (void)fuse_vnode_setsize(vp, fattr->size);
fvdat->flag &= ~FN_SIZECHANGE;
}
- *vpp = vp;
- }
- if (op == FUSE_GETATTR) {
- struct fuse_attr_out *fao =
- (struct fuse_attr_out*)fdi.answ;
- fuse_internal_cache_attrs(*vpp,
- &fao->attr, fao->attr_valid,
- fao->attr_valid_nsec, NULL);
- } else {
- struct fuse_entry_out *feo =
- (struct fuse_entry_out*)fdi.answ;
- fuse_internal_cache_attrs(*vpp,
- &feo->attr, feo->attr_valid,
- feo->attr_valid_nsec, NULL);
- }
+ MPASS(feo != NULL);
+ fuse_internal_cache_attrs(*vpp, &feo->attr,
+ feo->attr_valid, feo->attr_valid_nsec, NULL);
+ fuse_validity_2_bintime(feo->entry_valid,
+ feo->entry_valid_nsec,
+ &fvdat->entry_cache_timeout);
- /* Insert name into cache if appropriate. */
+ if ((nameiop == DELETE || nameiop == RENAME) &&
+ islastcn)
+ {
+ struct vattr dvattr;
- /*
- * Nooo, caching is evil. With caching, we can't avoid stale
- * information taking over the playground (cached info is not
- * just positive/negative, it does have qualitative aspects,
- * too). And a (VOP/FUSE)_GETATTR is always thrown anyway, when
- * walking down along cached path components, and that's not
- * any cheaper than FUSE_LOOKUP. This might change with
- * implementing kernel side attr caching, but... In Linux,
- * lookup results are not cached, and the daemon is bombarded
- * with FUSE_LOOKUPS on and on. This shows that by design, the
- * daemon is expected to handle frequent lookup queries
- * efficiently, do its caching in userspace, and so on.
- *
- * So just leave the name cache alone.
- */
-
- /*
- * Well, now I know, Linux caches lookups, but with a
- * timeout... So it's the same thing as attribute caching:
- * we can deal with it when implement timeouts.
- */
-#if 0
- if (cnp->cn_flags & MAKEENTRY) {
- cache_enter(dvp, *vpp, cnp);
- }
-#endif
- }
-out:
- if (!lookup_err) {
-
- /* No lookup error; need to clean up. */
-
- if (err) { /* Found inode; exit with no vnode. */
- if (op == FUSE_LOOKUP) {
- fuse_internal_forget_send(vnode_mount(dvp), td, cred,
- nid, 1);
- }
- fdisp_destroy(&fdi);
- return err;
- } else {
-#ifndef NO_EARLY_PERM_CHECK_HACK
- if (!islastcn) {
- /*
- * We have the attributes of the next item
- * *now*, and it's a fact, and we do not
- * have to do extra work for it (ie, beg the
- * daemon), and it neither depends on such
- * accidental things like attr caching. So
- * the big idea: check credentials *now*,
- * not at the beginning of the next call to
- * lookup.
- *
- * The first item of the lookup chain (fs root)
- * won't be checked then here, of course, as
- * its never "the next". But go and see that
- * the root is taken care about at the very
- * beginning of this function.
- *
- * Now, given we want to do the access check
- * this way, one might ask: so then why not
- * do the access check just after fetching
- * the inode and its attributes from the
- * daemon? Why bother with producing the
- * corresponding vnode at all if something
- * is not OK? We know what's the deal as
- * soon as we get those attrs... There is
- * one bit of info though not given us by
- * the daemon: whether his response is
- * authoritative or not... His response should
- * be ignored if something is mounted over
- * the dir in question. But that can be
- * known only by having the vnode...
+ err = fuse_internal_access(dvp, VWRITE, td,
+ cred);
+ if (err != 0)
+ goto out;
+ /*
+ * if the parent's sticky bit is set, check
+ * whether we're allowed to remove the file.
+ * Need to figure out the vnode locking to make
+ * this work.
*/
- int tmpvtype = vnode_vtype(*vpp);
-
- bzero(&facp, sizeof(facp));
- /*the early perm check hack */
- facp.facc_flags |= FACCESS_VA_VALID;
-
- if ((tmpvtype != VDIR) && (tmpvtype != VLNK)) {
- err = ENOTDIR;
+ fuse_internal_getattr(dvp, &dvattr, cred, td);
+ if ((dvattr.va_mode & S_ISTXT) &&
+ fuse_internal_access(dvp, VADMIN, td,
+ cred) &&
+ fuse_internal_access(*vpp, VADMIN, td,
+ cred)) {
+ err = EPERM;
+ goto out;
}
- if (!err && !vnode_mountedhere(*vpp)) {
- err = fuse_internal_access(*vpp, VEXEC, &facp, td, cred);
- }
- if (err) {
- if (tmpvtype == VLNK)
- SDT_PROBE2(fuse, , vnops, trace,
- 1, "weird, permission "
- "error with a symlink?");
- vput(*vpp);
- *vpp = NULL;
- }
}
-#endif
+
+ if (islastcn && (
+ (nameiop == DELETE) ||
+ (nameiop == RENAME && wantparent))) {
+ cnp->cn_flags |= SAVENAME;
+ }
+
}
}
- fdisp_destroy(&fdi);
+out:
+ if (err) {
+ if (vp != NULL && dvp != vp)
+ vput(vp);
+ else if (vp != NULL)
+ vrele(vp);
+ *vpp = NULL;
+ }
+ if (did_lookup)
+ fdisp_destroy(&fdi);
return err;
}
@@ -1117,6 +1252,7 @@
return ENXIO;
}
fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ fmdi.umask = curthread->td_proc->p_fd->fd_cmask;
return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi,
sizeof(fmdi), VDIR));
@@ -1134,12 +1270,19 @@
fuse_vnop_mknod(struct vop_mknod_args *ap)
{
- return (EINVAL);
-}
+ struct vnode *dvp = ap->a_dvp;
+ struct vnode **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ struct vattr *vap = ap->a_vap;
+ if (fuse_isdeadfs(dvp))
+ return ENXIO;
+ return fuse_internal_mknod(dvp, vpp, cnp, vap);
+}
+
/*
- struct vnop_open_args {
+ struct vop_open_args {
struct vnode *a_vp;
int a_mode;
struct ucred *a_cred;
@@ -1151,50 +1294,27 @@
fuse_vnop_open(struct vop_open_args *ap)
{
struct vnode *vp = ap->a_vp;
- int mode = ap->a_mode;
+ int a_mode = ap->a_mode;
struct thread *td = ap->a_td;
struct ucred *cred = ap->a_cred;
-
- fufh_type_t fufh_type;
+ pid_t pid = td->td_proc->p_pid;
struct fuse_vnode_data *fvdat;
- int error, isdir = 0;
- int32_t fuse_open_flags;
-
- if (fuse_isdeadfs(vp)) {
+ if (fuse_isdeadfs(vp))
return ENXIO;
- }
- if ((mode & (FREAD | FWRITE)) == 0)
+ if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO)
+ return (EOPNOTSUPP);
+ if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0)
return EINVAL;
fvdat = VTOFUD(vp);
- if (vnode_isdir(vp)) {
- isdir = 1;
- }
- fuse_open_flags = 0;
- if (isdir) {
- fufh_type = FUFH_RDONLY;
- } else {
- fufh_type = fuse_filehandle_xlate_from_fflags(mode);
- /*
- * For WRONLY opens, force DIRECT_IO. This is necessary
- * since writing a partial block through the buffer cache
- * will result in a read of the block and that read won't
- * be allowed by the WRONLY open.
- */
- if (fufh_type == FUFH_WRONLY ||
- (fvdat->flag & FN_DIRECTIO) != 0)
- fuse_open_flags = FOPEN_DIRECT_IO;
- }
-
- if (fuse_filehandle_validrw(vp, fufh_type) != FUFH_INVALID) {
- fuse_vnode_open(vp, fuse_open_flags, td);
+ if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) {
+ fuse_vnode_open(vp, 0, td);
return 0;
}
- error = fuse_filehandle_open(vp, fufh_type, NULL, td, cred);
- return error;
+ return fuse_filehandle_open(vp, a_mode, NULL, td, cred);
}
static int
@@ -1237,6 +1357,7 @@
struct uio *uio = ap->a_uio;
int ioflag = ap->a_ioflag;
struct ucred *cred = ap->a_cred;
+ pid_t pid = curthread->td_proc->p_pid;
if (fuse_isdeadfs(vp)) {
return ENXIO;
@@ -1246,7 +1367,7 @@
ioflag |= IO_DIRECT;
}
- return fuse_io_dispatch(vp, uio, ioflag, cred);
+ return fuse_io_dispatch(vp, uio, ioflag, cred, pid);
}
/*
@@ -1255,7 +1376,7 @@
struct uio *a_uio;
struct ucred *a_cred;
int *a_eofflag;
- int *ncookies;
+ int *a_ncookies;
u_long **a_cookies;
};
*/
@@ -1265,13 +1386,18 @@
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct ucred *cred = ap->a_cred;
-
struct fuse_filehandle *fufh = NULL;
struct fuse_iov cookediov;
-
int err = 0;
- int freefufh = 0;
+ u_long *cookies;
+ off_t startoff;
+ ssize_t tresid;
+ int ncookies;
+ bool closefufh = false;
+ pid_t pid = curthread->td_proc->p_pid;
+ if (ap->a_eofflag)
+ *ap->a_eofflag = 0;
if (fuse_isdeadfs(vp)) {
return ENXIO;
}
@@ -1280,26 +1406,61 @@
return EINVAL;
}
- if (!fuse_filehandle_valid(vp, FUFH_RDONLY)) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
- "calling readdir() before open()");
- err = fuse_filehandle_open(vp, FUFH_RDONLY, &fufh, NULL, cred);
- freefufh = 1;
- } else {
- err = fuse_filehandle_get(vp, FUFH_RDONLY, &fufh);
+ tresid = uio->uio_resid;
+ startoff = uio->uio_offset;
+ err = fuse_filehandle_get_dir(vp, &fufh, cred, pid);
+ if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
+ /*
+ * nfsd will do VOP_READDIR without first doing VOP_OPEN. We
+ * must implicitly open the directory here
+ */
+ err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred);
+ if (err == 0) {
+ /*
+ * When a directory is opened, it must be read from
+ * the beginning. Hopefully, the "startoff" still
+ * exists as an offset cookie for the directory.
+ * If not, it will read the entire directory without
+ * returning any entries and just return eof.
+ */
+ uio->uio_offset = 0;
+ }
+ closefufh = true;
}
- if (err) {
+ if (err)
return (err);
+ if (ap->a_ncookies != NULL) {
+ ncookies = uio->uio_resid /
+ (offsetof(struct dirent, d_name) + 4) + 1;
+ cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK);
+ *ap->a_ncookies = ncookies;
+ *ap->a_cookies = cookies;
+ } else {
+ ncookies = 0;
+ cookies = NULL;
}
#define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1)
fiov_init(&cookediov, DIRCOOKEDSIZE);
- err = fuse_internal_readdir(vp, uio, fufh, &cookediov);
+ err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov,
+ &ncookies, cookies);
fiov_teardown(&cookediov);
- if (freefufh) {
- fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred);
+ if (closefufh)
+ fuse_filehandle_close(vp, fufh, curthread, cred);
+
+ if (ap->a_ncookies != NULL) {
+ if (err == 0) {
+ *ap->a_ncookies -= ncookies;
+ } else {
+ free(*ap->a_cookies, M_TEMP);
+ *ap->a_ncookies = 0;
+ *ap->a_cookies = NULL;
+ }
}
+ if (err == 0 && tresid == uio->uio_resid)
+ *ap->a_eofflag = 1;
+
return err;
}
@@ -1356,22 +1517,16 @@
{
struct vnode *vp = ap->a_vp;
struct thread *td = ap->a_td;
-
struct fuse_vnode_data *fvdat = VTOFUD(vp);
- struct fuse_filehandle *fufh = NULL;
+ struct fuse_filehandle *fufh, *fufh_tmp;
- int type;
-
if (!fvdat) {
panic("FUSE: no vnode data during recycling");
}
- for (type = 0; type < FUFH_MAXTYPE; type++) {
- fufh = &(fvdat->fufh[type]);
- if (FUFH_IS_VALID(fufh)) {
- printf("FUSE: vnode being reclaimed but fufh (type=%d) is valid",
- type);
- fuse_filehandle_close(vp, type, td, NULL);
- }
+ LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) {
+ printf("FUSE: vnode being reclaimed with open fufh "
+ "(type=%#x)", fufh->fufh_type);
+ fuse_filehandle_close(vp, fufh, td, NULL);
}
if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) {
@@ -1409,12 +1564,9 @@
if (vnode_isdir(vp)) {
return EPERM;
}
- cache_purge(vp);
err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK);
- if (err == 0)
- fuse_internal_vnode_disappear(vp);
return err;
}
@@ -1438,7 +1590,8 @@
struct vnode *tvp = ap->a_tvp;
struct componentname *tcnp = ap->a_tcnp;
struct fuse_data *data;
-
+ bool newparent = fdvp != tdvp;
+ bool isdir = fvp->v_type == VDIR;
int err = 0;
if (fuse_isdeadfs(fdvp)) {
@@ -1446,7 +1599,7 @@
}
if (fvp->v_mount != tdvp->v_mount ||
(tvp && fvp->v_mount != tvp->v_mount)) {
- SDT_PROBE2(fuse, , vnops, trace, 1, "cross-device rename");
+ SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename");
err = EXDEV;
goto out;
}
@@ -1457,7 +1610,17 @@
* under the source directory in the file system tree.
* Linux performs this check at VFS level.
*/
+ /*
+ * If source is a directory, and it will get a new parent, user must
+ * have write permission to it, so ".." can be modified.
+ */
data = fuse_get_mpdata(vnode_mount(tdvp));
+ if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) {
+ err = fuse_internal_access(fvp, VWRITE,
+ tcnp->cn_thread, tcnp->cn_cred);
+ if (err)
+ goto out;
+ }
sx_xlock(&data->rename_lock);
err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp);
if (err == 0) {
@@ -1515,8 +1678,6 @@
}
err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR);
- if (err == 0)
- fuse_internal_vnode_disappear(vp);
return err;
}
@@ -1535,129 +1696,137 @@
struct vattr *vap = ap->a_vap;
struct ucred *cred = ap->a_cred;
struct thread *td = curthread;
+ struct mount *mp;
+ struct fuse_data *data;
+ struct vattr old_va;
+ int dataflags;
+ int err = 0, err2;
+ accmode_t accmode = 0;
+ bool checkperm;
+ bool drop_suid = false;
+ gid_t cr_gid;
- struct fuse_dispatcher fdi;
- struct fuse_setattr_in *fsai;
- struct fuse_access_param facp;
+ mp = vnode_mount(vp);
+ data = fuse_get_mpdata(mp);
+ dataflags = data->dataflags;
+ checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS;
+ if (cred->cr_ngroups > 0)
+ cr_gid = cred->cr_groups[0];
+ else
+ cr_gid = 0;
- int err = 0;
- enum vtype vtyp;
- int sizechanged = 0;
- uint64_t newsize = 0;
-
if (fuse_isdeadfs(vp)) {
return ENXIO;
}
- fdisp_init(&fdi, sizeof(*fsai));
- fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred);
- fsai = fdi.indata;
- fsai->valid = 0;
- bzero(&facp, sizeof(facp));
-
- facp.xuid = vap->va_uid;
- facp.xgid = vap->va_gid;
-
if (vap->va_uid != (uid_t)VNOVAL) {
- facp.facc_flags |= FACCESS_CHOWN;
- fsai->uid = vap->va_uid;
- fsai->valid |= FATTR_UID;
+ if (checkperm) {
+ /* Only root may change a file's owner */
+ err = priv_check_cred(cred, PRIV_VFS_CHOWN);
+ if (err) {
+ /* As a special case, allow the null chown */
+ err2 = fuse_internal_getattr(vp, &old_va, cred,
+ td);
+ if (err2)
+ return (err2);
+ if (vap->va_uid != old_va.va_uid)
+ return err;
+ else
+ accmode |= VADMIN;
+ drop_suid = true;
+ } else
+ accmode |= VADMIN;
+ } else
+ accmode |= VADMIN;
}
if (vap->va_gid != (gid_t)VNOVAL) {
- facp.facc_flags |= FACCESS_CHOWN;
- fsai->gid = vap->va_gid;
- fsai->valid |= FATTR_GID;
+ if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN))
+ drop_suid = true;
+ if (checkperm && !groupmember(vap->va_gid, cred))
+ {
+ /*
+ * Non-root users may only chgrp to one of their own
+ * groups
+ */
+ err = priv_check_cred(cred, PRIV_VFS_CHOWN);
+ if (err) {
+ /* As a special case, allow the null chgrp */
+ err2 = fuse_internal_getattr(vp, &old_va, cred,
+ td);
+ if (err2)
+ return (err2);
+ if (vap->va_gid != old_va.va_gid)
+ return err;
+ accmode |= VADMIN;
+ } else
+ accmode |= VADMIN;
+ } else
+ accmode |= VADMIN;
}
if (vap->va_size != VNOVAL) {
-
- struct fuse_filehandle *fufh = NULL;
-
- /*Truncate to a new value. */
- fsai->size = vap->va_size;
- sizechanged = 1;
- newsize = vap->va_size;
- fsai->valid |= FATTR_SIZE;
-
- fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh);
- if (fufh) {
- fsai->fh = fufh->fh_id;
- fsai->valid |= FATTR_FH;
+ switch (vp->v_type) {
+ case VDIR:
+ return (EISDIR);
+ case VLNK:
+ case VREG:
+ if (vfs_isrdonly(mp))
+ return (EROFS);
+ break;
+ default:
+ /*
+ * According to POSIX, the result is unspecified
+ * for file types other than regular files,
+ * directories and shared memory objects. We
+ * don't support shared memory objects in the file
+ * system, and have dubious support for truncating
+ * symlinks. Just ignore the request in other cases.
+ */
+ return (0);
}
+ /* Don't set accmode. Permission to trunc is checked upstack */
}
- if (vap->va_atime.tv_sec != VNOVAL) {
- fsai->atime = vap->va_atime.tv_sec;
- fsai->atimensec = vap->va_atime.tv_nsec;
- fsai->valid |= FATTR_ATIME;
+ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+ if (vap->va_vaflags & VA_UTIMES_NULL)
+ accmode |= VWRITE;
+ else
+ accmode |= VADMIN;
}
- if (vap->va_mtime.tv_sec != VNOVAL) {
- fsai->mtime = vap->va_mtime.tv_sec;
- fsai->mtimensec = vap->va_mtime.tv_nsec;
- fsai->valid |= FATTR_MTIME;
+ if (drop_suid) {
+ if (vap->va_mode != (mode_t)VNOVAL)
+ vap->va_mode &= ~(S_ISUID | S_ISGID);
+ else {
+ err = fuse_internal_getattr(vp, &old_va, cred, td);
+ if (err)
+ return (err);
+ vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID);
+ }
}
if (vap->va_mode != (mode_t)VNOVAL) {
- fsai->mode = vap->va_mode & ALLPERMS;
- fsai->valid |= FATTR_MODE;
+ /* Only root may set the sticky bit on non-directories */
+ if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT)
+ && priv_check_cred(cred, PRIV_VFS_STICKYFILE))
+ return EFTYPE;
+ if (checkperm && (vap->va_mode & S_ISGID)) {
+ err = fuse_internal_getattr(vp, &old_va, cred, td);
+ if (err)
+ return (err);
+ if (!groupmember(old_va.va_gid, cred)) {
+ err = priv_check_cred(cred, PRIV_VFS_SETGID);
+ if (err)
+ return (err);
+ }
+ }
+ accmode |= VADMIN;
}
- if (!fsai->valid) {
- goto out;
- }
- vtyp = vnode_vtype(vp);
- if (fsai->valid & FATTR_SIZE && vtyp == VDIR) {
- err = EISDIR;
- goto out;
- }
- if (vfs_isrdonly(vnode_mount(vp)) && (fsai->valid & ~FATTR_SIZE || vtyp == VREG)) {
- err = EROFS;
- goto out;
- }
- if (fsai->valid & ~FATTR_SIZE) {
- /*err = fuse_internal_access(vp, VADMIN, context, &facp); */
- /*XXX */
- err = 0;
- }
- facp.facc_flags &= ~FACCESS_XQUERIES;
+ if (vfs_isrdonly(mp))
+ return EROFS;
- if (err && !(fsai->valid & ~(FATTR_ATIME | FATTR_MTIME)) &&
- vap->va_vaflags & VA_UTIMES_NULL) {
- err = fuse_internal_access(vp, VWRITE, &facp, td, cred);
- }
+ err = fuse_internal_access(vp, accmode, td, cred);
if (err)
- goto out;
- if ((err = fdisp_wait_answ(&fdi)))
- goto out;
- vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode);
-
- if (vnode_vtype(vp) != vtyp) {
- if (vnode_vtype(vp) == VNON && vtyp != VNON) {
- SDT_PROBE2(fuse, , vnops, trace, 1, "FUSE: Dang! "
- "vnode_vtype is VNON and vtype isn't.");
- } else {
- /*
- * STALE vnode, ditch
- *
- * The vnode has changed its type "behind our back".
- * There's nothing really we can do, so let us just
- * force an internal revocation and tell the caller to
- * try again, if interested.
- */
- fuse_internal_vnode_disappear(vp);
- err = EAGAIN;
- }
- }
- if (err == 0) {
- struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ;
- fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
- fao->attr_valid_nsec, NULL);
- }
-
-out:
- fdisp_destroy(&fdi);
- if (!err && sizechanged) {
- fuse_vnode_setsize(vp, newsize);
- VTOFUD(vp)->flag &= ~FN_SIZECHANGE;
- }
- return err;
+ return err;
+ else
+ return fuse_internal_setattr(vp, vap, td, cred);
}
/*
@@ -1676,22 +1845,15 @@
bp->b_ioflags |= BIO_ERROR;
bp->b_error = ENXIO;
bufdone(bp);
- return ENXIO;
+ return 0;
}
- if (bp->b_iocmd == BIO_WRITE)
- fuse_vnode_refreshsize(vp, NOCRED);
- (void)fuse_io_strategy(vp, bp);
-
/*
- * This is a dangerous function. If returns error, that might mean a
- * panic. We prefer pretty much anything over being forced to panic
- * by a malicious daemon (a demon?). So we just return 0 anyway. You
- * should never mind this: this function has its own error
- * propagation mechanism via the argument buffer, so
- * not-that-melodramatic residents of the call chain still will be
- * able to know what to do.
+ * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags.
+ * fuse_io_strategy sets bp's error fields
*/
+ (void)fuse_io_strategy(vp, bp);
+
return 0;
}
@@ -1757,237 +1919,70 @@
struct uio *uio = ap->a_uio;
int ioflag = ap->a_ioflag;
struct ucred *cred = ap->a_cred;
+ pid_t pid = curthread->td_proc->p_pid;
if (fuse_isdeadfs(vp)) {
return ENXIO;
}
- fuse_vnode_refreshsize(vp, cred);
if (VTOFUD(vp)->flag & FN_DIRECTIO) {
ioflag |= IO_DIRECT;
}
- return fuse_io_dispatch(vp, uio, ioflag, cred);
+ return fuse_io_dispatch(vp, uio, ioflag, cred, pid);
}
-SDT_PROBE_DEFINE1(fuse, , vnops, vnop_getpages_error, "int");
-/*
- struct vnop_getpages_args {
- struct vnode *a_vp;
- vm_page_t *a_m;
- int a_count;
- int a_reqpage;
- };
-*/
-static int
-fuse_vnop_getpages(struct vop_getpages_args *ap)
+static daddr_t
+fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{
- int i, error, nextoff, size, toff, count, npages;
- struct uio uio;
- struct iovec iov;
- vm_offset_t kva;
- struct buf *bp;
- struct vnode *vp;
- struct thread *td;
- struct ucred *cred;
- vm_page_t *pages;
+ const int biosize = fuse_iosize(vp);
- vp = ap->a_vp;
- KASSERT(vp->v_object, ("objectless vp passed to getpages"));
- td = curthread; /* XXX */
- cred = curthread->td_ucred; /* XXX */
- pages = ap->a_m;
- npages = ap->a_count;
+ return (off / biosize);
+}
- if (!fsess_opt_mmap(vnode_mount(vp))) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
- "called on non-cacheable vnode??\n");
- return (VM_PAGER_ERROR);
- }
+static int
+fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn)
+{
+ off_t filesize;
+ int blksz, err;
+ const int biosize = fuse_iosize(vp);
- /*
- * If the last page is partially valid, just return it and allow
- * the pager to zero-out the blanks. Partially valid pages can
- * only occur at the file EOF.
- *
- * XXXGL: is that true for FUSE, which is a local filesystem,
- * but still somewhat disconnected from the kernel?
- */
- VM_OBJECT_WLOCK(vp->v_object);
- if (pages[npages - 1]->valid != 0 && --npages == 0)
- goto out;
- VM_OBJECT_WUNLOCK(vp->v_object);
+ err = fuse_vnode_size(vp, &filesize, NULL, NULL);
+ KASSERT(err == 0, ("vfs_bio_getpages can't handle errors here"));
+ if (err)
+ return biosize;
- /*
- * We use only the kva address for the buffer, but this is extremely
- * convenient and fast.
- */
- bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);
-
- kva = (vm_offset_t)bp->b_data;
- pmap_qenter(kva, pages, npages);
- VM_CNT_INC(v_vnodein);
- VM_CNT_ADD(v_vnodepgsin, npages);
-
- count = npages << PAGE_SHIFT;
- iov.iov_base = (caddr_t)kva;
- iov.iov_len = count;
- uio.uio_iov = &iov;
- uio.uio_iovcnt = 1;
- uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
- uio.uio_resid = count;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_rw = UIO_READ;
- uio.uio_td = td;
-
- error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred);
- pmap_qremove(kva, npages);
-
- uma_zfree(fuse_pbuf_zone, bp);
-
- if (error && (uio.uio_resid == count)) {
- SDT_PROBE1(fuse, , vnops, vnop_getpages_error, error);
- return VM_PAGER_ERROR;
+ if ((off_t)lbn * biosize >= filesize) {
+ blksz = 0;
+ } else if ((off_t)(lbn + 1) * biosize > filesize) {
+ blksz = filesize - (off_t)lbn * biosize;
+ } else {
+ blksz = biosize;
}
- /*
- * Calculate the number of bytes read and validate only that number
- * of bytes. Note that due to pending writes, size may be 0. This
- * does not mean that the remaining data is invalid!
- */
-
- size = count - uio.uio_resid;
- VM_OBJECT_WLOCK(vp->v_object);
- fuse_vm_page_lock_queues();
- for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
- vm_page_t m;
-
- nextoff = toff + PAGE_SIZE;
- m = pages[i];
-
- if (nextoff <= size) {
- /*
- * Read operation filled an entire page
- */
- m->valid = VM_PAGE_BITS_ALL;
- KASSERT(m->dirty == 0,
- ("fuse_getpages: page %p is dirty", m));
- } else if (size > toff) {
- /*
- * Read operation filled a partial page.
- */
- m->valid = 0;
- vm_page_set_valid_range(m, 0, size - toff);
- KASSERT(m->dirty == 0,
- ("fuse_getpages: page %p is dirty", m));
- } else {
- /*
- * Read operation was short. If no error occurred
- * we may have hit a zero-fill section. We simply
- * leave valid set to 0.
- */
- ;
- }
- }
- fuse_vm_page_unlock_queues();
-out:
- VM_OBJECT_WUNLOCK(vp->v_object);
- if (ap->a_rbehind)
- *ap->a_rbehind = 0;
- if (ap->a_rahead)
- *ap->a_rahead = 0;
- return (VM_PAGER_OK);
+ return (blksz);
}
/*
- struct vnop_putpages_args {
+ struct vnop_getpages_args {
struct vnode *a_vp;
vm_page_t *a_m;
int a_count;
- int a_sync;
- int *a_rtvals;
- vm_ooffset_t a_offset;
+ int a_reqpage;
};
*/
static int
-fuse_vnop_putpages(struct vop_putpages_args *ap)
+fuse_vnop_getpages(struct vop_getpages_args *ap)
{
- struct uio uio;
- struct iovec iov;
- vm_offset_t kva;
- struct buf *bp;
- int i, error, npages, count;
- off_t offset;
- int *rtvals;
- struct vnode *vp;
- struct thread *td;
- struct ucred *cred;
- vm_page_t *pages;
- vm_ooffset_t fsize;
+ struct vnode *vp = ap->a_vp;
- vp = ap->a_vp;
- KASSERT(vp->v_object, ("objectless vp passed to putpages"));
- fsize = vp->v_object->un_pager.vnp.vnp_size;
- td = curthread; /* XXX */
- cred = curthread->td_ucred; /* XXX */
- pages = ap->a_m;
- count = ap->a_count;
- rtvals = ap->a_rtvals;
- npages = btoc(count);
- offset = IDX_TO_OFF(pages[0]->pindex);
-
if (!fsess_opt_mmap(vnode_mount(vp))) {
- SDT_PROBE2(fuse, , vnops, trace, 1,
+ SDT_PROBE2(fusefs, , vnops, trace, 1,
"called on non-cacheable vnode??\n");
+ return (VM_PAGER_ERROR);
}
- for (i = 0; i < npages; i++)
- rtvals[i] = VM_PAGER_AGAIN;
- /*
- * When putting pages, do not extend file past EOF.
- */
-
- if (offset + count > fsize) {
- count = fsize - offset;
- if (count < 0)
- count = 0;
- }
- /*
- * We use only the kva address for the buffer, but this is extremely
- * convenient and fast.
- */
- bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK);
-
- kva = (vm_offset_t)bp->b_data;
- pmap_qenter(kva, pages, npages);
- VM_CNT_INC(v_vnodeout);
- VM_CNT_ADD(v_vnodepgsout, count);
-
- iov.iov_base = (caddr_t)kva;
- iov.iov_len = count;
- uio.uio_iov = &iov;
- uio.uio_iovcnt = 1;
- uio.uio_offset = offset;
- uio.uio_resid = count;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_rw = UIO_WRITE;
- uio.uio_td = td;
-
- error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred);
-
- pmap_qremove(kva, npages);
- uma_zfree(fuse_pbuf_zone, bp);
-
- if (!error) {
- int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
-
- for (i = 0; i < nwritten; i++) {
- rtvals[i] = VM_PAGER_OK;
- VM_OBJECT_WLOCK(pages[i]->object);
- vm_page_undirty(pages[i]);
- VM_OBJECT_WUNLOCK(pages[i]->object);
- }
- }
- return rtvals[0];
+ return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
+ ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz));
}
static const char extattr_namespace_separator = '.';
@@ -2023,6 +2018,13 @@
if (fuse_isdeadfs(vp))
return (ENXIO);
+ if (!fsess_isimpl(mp, FUSE_GETXATTR))
+ return EOPNOTSUPP;
+
+ err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
+ if (err)
+ return err;
+
/* Default to looking for user attributes. */
if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
@@ -2053,8 +2055,10 @@
err = fdisp_wait_answ(&fdi);
if (err != 0) {
- if (err == ENOSYS)
+ if (err == ENOSYS) {
fsess_set_notimpl(mp, FUSE_GETXATTR);
+ err = EOPNOTSUPP;
+ }
goto out;
}
@@ -2100,6 +2104,29 @@
if (fuse_isdeadfs(vp))
return (ENXIO);
+ if (!fsess_isimpl(mp, FUSE_SETXATTR))
+ return EOPNOTSUPP;
+
+ if (vfs_isrdonly(mp))
+ return EROFS;
+
+ /* Deleting xattrs must use VOP_DELETEEXTATTR instead */
+ if (ap->a_uio == NULL) {
+ /*
+ * If we got here as fallback from VOP_DELETEEXTATTR, then
+ * return EOPNOTSUPP.
+ */
+ if (!fsess_isimpl(mp, FUSE_REMOVEXATTR))
+ return (EOPNOTSUPP);
+ else
+ return (EINVAL);
+ }
+
+ err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
+ VWRITE);
+ if (err)
+ return err;
+
/* Default to looking for user attributes. */
if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
@@ -2127,11 +2154,14 @@
err = fdisp_wait_answ(&fdi);
- if (err != 0) {
- if (err == ENOSYS)
- fsess_set_notimpl(mp, FUSE_SETXATTR);
- goto out;
+ if (err == ENOSYS) {
+ fsess_set_notimpl(mp, FUSE_SETXATTR);
+ err = EOPNOTSUPP;
}
+ if (err == ERESTART) {
+ /* Can't restart after calling uiomove */
+ err = EINTR;
+ }
out:
fdisp_destroy(&fdi);
@@ -2227,6 +2257,13 @@
if (fuse_isdeadfs(vp))
return (ENXIO);
+ if (!fsess_isimpl(mp, FUSE_LISTXATTR))
+ return EOPNOTSUPP;
+
+ err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
+ if (err)
+ return err;
+
/*
* Add space for a NUL and the period separator if enabled.
* Default to looking for user attributes.
@@ -2251,8 +2288,10 @@
err = fdisp_wait_answ(&fdi);
if (err != 0) {
- if (err == ENOSYS)
+ if (err == ENOSYS) {
fsess_set_notimpl(mp, FUSE_LISTXATTR);
+ err = EOPNOTSUPP;
+ }
goto out;
}
@@ -2267,7 +2306,7 @@
/*
* Retrieve Linux / FUSE compatible list values.
*/
- fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
+ fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
list_xattr_in = fdi.indata;
list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out);
attr_str = (char *)fdi.indata + sizeof(*list_xattr_in);
@@ -2330,6 +2369,17 @@
if (fuse_isdeadfs(vp))
return (ENXIO);
+ if (!fsess_isimpl(mp, FUSE_REMOVEXATTR))
+ return EOPNOTSUPP;
+
+ if (vfs_isrdonly(mp))
+ return EROFS;
+
+ err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
+ VWRITE);
+ if (err)
+ return err;
+
/* Default to looking for user attributes. */
if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
@@ -2347,9 +2397,9 @@
ap->a_name);
err = fdisp_wait_answ(&fdi);
- if (err != 0) {
- if (err == ENOSYS)
- fsess_set_notimpl(mp, FUSE_REMOVEXATTR);
+ if (err == ENOSYS) {
+ fsess_set_notimpl(mp, FUSE_REMOVEXATTR);
+ err = EOPNOTSUPP;
}
fdisp_destroy(&fdi);
@@ -2373,3 +2423,48 @@
return 0;
}
+
+/*
+ * Get an NFS filehandle for a FUSE file.
+ *
+ * This will only work for FUSE file systems that guarantee the uniqueness of
+ * nodeid:generation, which most don't.
+ */
+/*
+vop_vptofh {
+ IN struct vnode *a_vp;
+ IN struct fid *a_fhp;
+};
+*/
+static int
+fuse_vnop_vptofh(struct vop_vptofh_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct fuse_vnode_data *fvdat = VTOFUD(vp);
+ struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp);
+ _Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid),
+ "FUSE fid type is too big");
+ struct mount *mp = vnode_mount(vp);
+ struct fuse_data *data = fuse_get_mpdata(mp);
+ struct vattr va;
+ int err;
+
+ if (!(data->dataflags & FSESS_EXPORT_SUPPORT))
+ return EOPNOTSUPP;
+
+ err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread);
+ if (err)
+ return err;
+
+ /*ip = VTOI(ap->a_vp);*/
+ /*ufhp = (struct ufid *)ap->a_fhp;*/
+ fhp->len = sizeof(struct fuse_fid);
+ fhp->nid = fvdat->nid;
+ if (fvdat->generation <= UINT32_MAX)
+ fhp->gen = fvdat->generation;
+ else
+ return EOVERFLOW;
+ return (0);
+}
+
+
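Note: fuse_vnop_vptofh() above only packs the handle; for NFS exporting to
actually work, the mount's fhtovp routine must map a fuse_fid back to a vnode
and reject stale generations. The following is a minimal illustrative sketch of
that reverse direction, not the code from this change: fuse_nfs_nodeid_to_vnode()
is a hypothetical stand-in for whatever nodeid-to-vnode lookup the mount
provides, and only the fuse_fid fields used above (len, nid, gen) are assumed.

    /* Illustrative sketch only; fuse_nfs_nodeid_to_vnode() is hypothetical. */
    static int
    example_fhtovp(struct mount *mp, struct fid *fidp, int flags,
        struct vnode **vpp)
    {
            struct fuse_fid *ffid = (struct fuse_fid *)fidp;
            struct vnode *vp;
            int err;

            if (ffid->len != sizeof(struct fuse_fid))
                    return (EINVAL);
            /* Resolve the FUSE nodeid to an in-core vnode (hypothetical helper). */
            err = fuse_nfs_nodeid_to_vnode(mp, ffid->nid, &vp);
            if (err != 0)
                    return (err);
            /* The generation number detects nodeid reuse by the daemon. */
            if (VTOFUD(vp)->generation != ffid->gen) {
                    vput(vp);
                    return (ESTALE);
            }
            *vpp = vp;
            return (0);
    }
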
Index: sys/kern/vfs_cache.c
===================================================================
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -1964,7 +1964,7 @@
}
/*
- * Invalidate all entries to a particular vnode.
+ * Invalidate all entries from and to a particular vnode.
*/
void
cache_purge(struct vnode *vp)
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -118,6 +118,8 @@
static void vfs_knl_assert_unlocked(void *arg);
static void vnlru_return_batches(struct vfsops *mnt_op);
static void destroy_vpollinfo(struct vpollinfo *vi);
+static int v_inval_buf_range1(struct vnode *vp, struct bufobj *bo,
+ daddr_t startlbn, daddr_t endlbn);
/*
* These fences are intended for cases where some synchronization is
@@ -945,6 +947,12 @@
* desirable to reuse such vnodes. These conditions may cause the
* number of vnodes to reach some minimum value regardless of what
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
+ *
+ * @param mp Try to reclaim vnodes from this mountpoint
+ * @param reclaim_nc_src Only reclaim directories with outgoing namecache
+ * entries if this argument is true
+ * @param trigger Only reclaim vnodes with fewer than this many resident pages.
+ * @return The number of vnodes that were reclaimed.
*/
static int
vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
@@ -1954,9 +1962,8 @@
vtruncbuf(struct vnode *vp, off_t length, int blksize)
{
struct buf *bp, *nbp;
- int anyfreed;
- daddr_t trunclbn;
struct bufobj *bo;
+ daddr_t startlbn;
CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
vp, blksize, (uintmax_t)length);
@@ -1964,22 +1971,114 @@
/*
* Round up to the *next* lbn.
*/
- trunclbn = howmany(length, blksize);
+ startlbn = howmany(length, blksize);
ASSERT_VOP_LOCKED(vp, "vtruncbuf");
+
restart:
bo = &vp->v_bufobj;
BO_LOCK(bo);
+ if (v_inval_buf_range1(vp, bo, startlbn, INT64_MAX) == EAGAIN)
+ goto restart;
+
+ if (length > 0) {
+restartsync:
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno > 0)
+ continue;
+ /*
+ * Since we hold the vnode lock this should only
+ * fail if we're racing with the buf daemon.
+ */
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK) {
+ goto restart;
+ }
+ VNASSERT((bp->b_flags & B_DELWRI), vp,
+ ("buf(%p) on dirty queue without DELWRI", bp));
+
+ bremfree(bp);
+ bawrite(bp);
+ BO_LOCK(bo);
+ goto restartsync;
+ }
+ }
+
+ bufobj_wwait(bo, 0, 0);
+ BO_UNLOCK(bo);
+ vnode_pager_setsize(vp, length);
+
+ return (0);
+}
+
+/*
+ * Invalidate the cached pages of a file's buffer within the range of block
+ * numbers [startlbn, endlbn). Every buffer that overlaps that range will be
+ * invalidated. This must not result in any dirty data being lost.
+ */
+void
+v_inval_buf_range(struct vnode *vp, off_t start, off_t end, int blksize)
+{
+ struct bufobj *bo;
+ daddr_t startlbn, endlbn;
+ vm_pindex_t startp, endp;
+
+ /* Round "outwards" */
+ startlbn = start / blksize;
+ endlbn = howmany(end, blksize);
+ startp = OFF_TO_IDX(start);
+ endp = OFF_TO_IDX(end + PAGE_SIZE - 1);
+
+ ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
+
+restart:
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+
+#ifdef INVARIANTS
+ struct buf *bp, *nbp;
+
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ /*
+ * Disallow invalidating dirty data outside of the requested
+ * offsets. Assume that data within the requested offsets is
+ * being invalidated for a good reason.
+ */
+ off_t blkstart, blkend;
+
+ blkstart = bp->b_offset;
+ blkend = bp->b_offset + bp->b_bcount;
+ KASSERT(blkstart >= start && blkend <= end,
+ ("Invalidating extra dirty data!"));
+ }
+#endif
+
+ if (v_inval_buf_range1(vp, bo, startlbn, endlbn) == EAGAIN)
+ goto restart;
+
+ BO_UNLOCK(bo);
+ vn_pages_remove(vp, startp, endp);
+}
+
+/* Like v_inval_buf_range, but operates on whole buffers instead of offsets */
+static int
+v_inval_buf_range1(struct vnode *vp, struct bufobj *bo,
+ daddr_t startlbn, daddr_t endlbn)
+{
+ struct buf *bp, *nbp;
+ int anyfreed;
+
anyfreed = 1;
for (;anyfreed;) {
anyfreed = 0;
TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
- if (bp->b_lblkno < trunclbn)
+ if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK)
- goto restart;
+ return EAGAIN;
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
@@ -1993,17 +2092,17 @@
(nbp->b_vp != vp) ||
(nbp->b_flags & B_DELWRI))) {
BO_UNLOCK(bo);
- goto restart;
+ return EAGAIN;
}
}
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
- if (bp->b_lblkno < trunclbn)
+ if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK)
- goto restart;
+ return EAGAIN;
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
@@ -2016,40 +2115,11 @@
(nbp->b_vp != vp) ||
(nbp->b_flags & B_DELWRI) == 0)) {
BO_UNLOCK(bo);
- goto restart;
+ return EAGAIN;
}
}
}
-
- if (length > 0) {
-restartsync:
- TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
- if (bp->b_lblkno > 0)
- continue;
- /*
- * Since we hold the vnode lock this should only
- * fail if we're racing with the buf daemon.
- */
- if (BUF_LOCK(bp,
- LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
- BO_LOCKPTR(bo)) == ENOLCK) {
- goto restart;
- }
- VNASSERT((bp->b_flags & B_DELWRI), vp,
- ("buf(%p) on dirty queue without DELWRI", bp));
-
- bremfree(bp);
- bawrite(bp);
- BO_LOCK(bo);
- goto restartsync;
- }
- }
-
- bufobj_wwait(bo, 0, 0);
- BO_UNLOCK(bo);
- vnode_pager_setsize(vp, length);
-
- return (0);
+ return 0;
}
static void
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -659,6 +659,8 @@
void vinactive(struct vnode *, struct thread *);
int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
int vtruncbuf(struct vnode *vp, off_t length, int blksize);
+void v_inval_buf_range(struct vnode *vp, off_t start, off_t end,
+ int blksize);
void vunref(struct vnode *);
void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
int vrecycle(struct vnode *vp);
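The new v_inval_buf_range() declared above takes byte offsets, rounds them
outward to whole buffers and pages, and requires the vnode lock to be held.
A minimal usage sketch under those assumptions follows; the surrounding
function, filesystem, and biosize parameter are illustrative only, not part of
this change.

    /* Illustrative only: drop cached buffers/pages before an uncached write. */
    static int
    example_directio_write(struct vnode *vp, struct uio *uio, int biosize)
    {
            off_t start = uio->uio_offset;
            off_t end = uio->uio_offset + uio->uio_resid;

            ASSERT_VOP_LOCKED(vp, "example_directio_write");
            /*
             * Invalidate every buffer and page overlapping [start, end) so a
             * later cached read cannot return data the backing store never
             * saw.  Dirty data inside the range is discarded, so the caller
             * must be the one about to overwrite that range.
             */
            v_inval_buf_range(vp, start, end, biosize);
            /* ... issue the uncached write of [start, end) here ... */
            return (0);
    }
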
Index: tests/sys/fs/Makefile
===================================================================
--- tests/sys/fs/Makefile
+++ tests/sys/fs/Makefile
@@ -1,5 +1,7 @@
# $FreeBSD$
+.include <bsd.compiler.mk>
+
PACKAGE= tests
TESTSDIR= ${TESTSBASE}/sys/fs
@@ -7,6 +9,9 @@
TESTSRC= ${SRCTOP}/contrib/netbsd-tests/fs
#TESTS_SUBDIRS+= nullfs # XXX: needs rump
+.if ${COMPILER_FEATURES:Mc++14}
+TESTS_SUBDIRS+= fusefs
+.endif
TESTS_SUBDIRS+= tmpfs
${PACKAGE}FILES+= h_funcs.subr
