Index: UPDATING =================================================================== --- UPDATING +++ UPDATING @@ -31,6 +31,18 @@ disable the most expensive debugging functionality run "ln -s 'abort:false,junk:false' /etc/malloc.conf".) +20190627: + The vfs.fusefs.sync_unmount and vfs.fusefs.init_backgrounded sysctls + and the "-o sync_unmount" and "-o init_backgrounded" mount options have + been removed from mount_fusefs(8). You can safely remove them from + your scripts, because they had no effect. + + The vfs.fusefs.fix_broken_io, vfs.fusefs.sync_resize, + vfs.fusefs.refresh_size, vfs.fusefs.mmap_enable, + vfs.fusefs.reclaim_revoked, and vfs.fusefs.data_cache_invalidate + sysctls have been removed. If you felt the need to set any of them to + a non-default value, please tell asomers@FreeBSD.org why. + 20190620: Entropy collection and the /dev/random device are no longer optional components. The "device random" option has been removed. Index: etc/mtree/BSD.tests.dist =================================================================== --- etc/mtree/BSD.tests.dist +++ etc/mtree/BSD.tests.dist @@ -731,6 +731,8 @@ file .. fs + fusefs + .. tmpfs .. .. Index: lib/libc/gen/getvfsbyname.c =================================================================== --- lib/libc/gen/getvfsbyname.c +++ lib/libc/gen/getvfsbyname.c @@ -37,10 +37,27 @@ #include #include #include +#include #include #include /* + * fusefs(5) file systems may have a "subtype" which gets appended to + * statfs(2)'s f_fstypename field on a per-mount basis. Allow getvfsbyname to + * match either the full "fusefs.foobar" or the more general "fusefs". + */ +static bool +are_fusefs(const char *fsname, const char *vfc_name) +{ + const static char fusefs[] = "fusefs"; + const static char fusefs_dot[] = "fusefs."; + + + return (strncmp(fsname, fusefs_dot, sizeof(fusefs_dot) - 1) == 0 && + strcmp(fusefs, vfc_name) == 0); +} + +/* * Given a filesystem name, determine if it is resident in the kernel, * and if it is resident, return its xvfsconf structure. */ @@ -62,7 +79,9 @@ } cnt = buflen / sizeof(struct xvfsconf); for (i = 0; i < cnt; i++) { - if (strcmp(fsname, xvfsp[i].vfc_name) == 0) { + if (strcmp(fsname, xvfsp[i].vfc_name) == 0 || + are_fusefs(fsname, xvfsp[i].vfc_name)) + { memcpy(vfcp, xvfsp + i, sizeof(struct xvfsconf)); free(xvfsp); return (0); Index: sbin/mount_fusefs/mount_fusefs.8 =================================================================== --- sbin/mount_fusefs/mount_fusefs.8 +++ sbin/mount_fusefs/mount_fusefs.8 @@ -3,6 +3,11 @@ .\" Copyright (c) 2005, 2006 Csaba Henk .\" All rights reserved. .\" +.\" Copyright (c) 2019 The FreeBSD Foundation +.\" +.\" Portions of this documentation were written by BFF Storage Systems under +.\" sponsorship from the FreeBSD Foundation. +.\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: @@ -29,7 +34,7 @@ .\" .\" $FreeBSD$ .\" -.Dd November 17, 2018 +.Dd June 14, 2019 .Dt MOUNT_FUSEFS 8 .Os .Sh NAME @@ -136,23 +141,33 @@ by prefixing them with .Dq no ) : .Bl -tag -width indent -.It Cm default_permissions -Enable traditional (file mode based) permission checking in kernel .It Cm allow_other Do not apply .Sx STRICT ACCESS POLICY . Only root can use this option +.It Cm async +I/O to the file system may be done asynchronously. +Writes may delayed and/or reordered. 
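Reviewer note on the getvfsbyname.c hunk above: with the are_fusefs() helper a caller may pass either the generic "fusefs" name or the per-mount "fusefs.<subtype>" string that statfs(2) reports. A minimal userland sketch of that behavior follows; the "sshfs" subtype is only an illustration of a string a daemon might set with "-o subtype=sshfs", not something this patch mandates.

    #include <sys/param.h>
    #include <sys/mount.h>

    #include <err.h>
    #include <stdio.h>

    int
    main(void)
    {
            struct xvfsconf vfc;

            /*
             * Both lookups should now resolve to the same in-kernel
             * filesystem type.
             */
            if (getvfsbyname("fusefs", &vfc) != 0)
                    err(1, "fusefs is not loaded");
            if (getvfsbyname("fusefs.sshfs", &vfc) != 0)
                    err(1, "subtype match failed");
            printf("matched vfs type %s (typenum %d)\n", vfc.vfc_name,
                vfc.vfc_typenum);
            return (0);
    }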
+.It Cm default_permissions +Enable traditional (file mode based) permission checking in kernel .It Cm max_read Ns = Ns Ar n Limit size of read requests to .Ar n +.It Cm neglect_shares +Do not refuse unmounting if there are secondary mounts .It Cm private Refuse shared mounting of the daemon. This is the default behaviour, to allow sharing, expicitly use .Fl o Cm noprivate -.It Cm neglect_shares -Do not refuse unmounting if there are secondary mounts .It Cm push_symlinks_in Prefix absolute symlinks with the mountpoint +.It Cm subtype Ns = Ns Ar fsname +Suffix +.Ar fsname +to the file system name as reported by +.Xr statfs 2 . +This option can be used to identify the file system implemented by +.Ar fuse_daemon . .El .El .Pp Index: sbin/mount_fusefs/mount_fusefs.c =================================================================== --- sbin/mount_fusefs/mount_fusefs.c +++ sbin/mount_fusefs/mount_fusefs.c @@ -5,6 +5,11 @@ * Copyright (c) 2005 Csaba Henk * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -60,7 +65,6 @@ void usage(void); void helpmsg(void); void showversion(void); -int init_backgrounded(void); static struct mntopt mopts[] = { #define ALTF_PRIVATE 0x01 @@ -73,8 +77,6 @@ { "max_read=", 0, ALTF_MAXREAD, 1 }, #define ALTF_SUBTYPE 0x40 { "subtype=", 0, ALTF_SUBTYPE, 1 }, - #define ALTF_SYNC_UNMOUNT 0x80 - { "sync_unmount", 0, ALTF_SYNC_UNMOUNT, 1 }, /* * MOPT_AUTOMOUNTED, included by MOPT_STDOPTS, does not fit into * the 'flags' argument to nmount(2). We have to abuse altflags @@ -91,6 +93,8 @@ { "large_read", 0, 0x00, 1 }, /* "nonempty", just the first two chars are stripped off during parsing */ { "nempty", 0, 0x00, 1 }, + { "async", 0, MNT_ASYNC, 0}, + { "noasync", 1, MNT_ASYNC, 0}, MOPT_STDOPTS, MOPT_END }; @@ -107,7 +111,7 @@ { 0, NULL, 0 } }; -#define DEFAULT_MOUNT_FLAGS ALTF_PRIVATE | ALTF_SYNC_UNMOUNT +#define DEFAULT_MOUNT_FLAGS ALTF_PRIVATE int main(int argc, char *argv[]) @@ -409,12 +413,6 @@ } } - if (fd >= 0 && ! init_backgrounded() && close(fd) < 0) { - if (pid) - kill(pid, SIGKILL); - err(1, "failed to close fuse device"); - } - /* Prepare the options vector for nmount(). build_iovec() is declared * in mntopts.h. */ sprintf(fdstr, "%d", fd); @@ -481,7 +479,6 @@ " -o neglect_shares don't report EBUSY when unmount attempted\n" " in presence of secondary mounts\n" " -o push_symlinks_in prefix absolute symlinks with mountpoint\n" - " -o sync_unmount do unmount synchronously\n" ); exit(EX_USAGE); } @@ -491,18 +488,4 @@ { puts("mount_fusefs [fuse4bsd] version: " FUSE4BSD_VERSION); exit(EX_USAGE); -} - -int -init_backgrounded(void) -{ - int ibg; - size_t len; - - len = sizeof(ibg); - - if (sysctlbyname("vfs.fusefs.init_backgrounded", &ibg, &len, NULL, 0)) - return (0); - - return (ibg); } Index: share/man/man5/fusefs.5 =================================================================== --- share/man/man5/fusefs.5 +++ share/man/man5/fusefs.5 @@ -3,8 +3,8 @@ .\" .\" Copyright (c) 2019 The FreeBSD Foundation .\" -.\" This software was developed by BFF Storage Systems, LLC under sponsorship -.\" from the FreeBSD Foundation. +.\" This documentation was written by BFF Storage Systems, LLC under +.\" sponsorship from the FreeBSD Foundation. 
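A note on the mount_fusefs.c hunk above: with sync_unmount and init_backgrounded gone, "-o async" is now handled as the ordinary MNT_ASYNC flag rather than an altflag. The sketch below shows roughly how such a mount reaches nmount(2); the iovec names ("fstype", "fspath", "fd") and the add_opt() helper are simplifications of what mount_fusefs builds with build_iovec(), and the real utility adds further pairs (special device name, subtype, error-message buffer).

    #include <sys/param.h>
    #include <sys/mount.h>
    #include <sys/uio.h>

    #include <err.h>
    #include <string.h>

    /* Illustrative stand-in for build_iovec() from mntopts.h. */
    static void
    add_opt(struct iovec *iov, int idx, const char *name, const char *val)
    {
            iov[idx].iov_base = __DECONST(char *, name);
            iov[idx].iov_len = strlen(name) + 1;
            iov[idx + 1].iov_base = __DECONST(char *, val);
            iov[idx + 1].iov_len = strlen(val) + 1;
    }

    int
    mount_sketch(const char *mntpath, const char *fdstr)
    {
            struct iovec iov[6];

            add_opt(iov, 0, "fstype", "fusefs");
            add_opt(iov, 2, "fspath", mntpath);
            add_opt(iov, 4, "fd", fdstr);   /* open /dev/fuse descriptor */

            /* "-o async" is now just MNT_ASYNC in the plain flags word. */
            if (nmount(iov, 6, MNT_ASYNC) < 0)
                    err(1, "nmount");
            return (0);
    }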
.\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions @@ -28,7 +28,7 @@ .\" SUCH DAMAGE. .\" .\" $FreeBSD$ -.Dd April 13, 2019 +.Dd June 27, 2019 .Dt FUSEFS 5 .Os .Sh NAME @@ -60,11 +60,9 @@ API is portable. Many daemons can run on multiple operating systems with minimal modifications. .Sh SYSCTL VARIABLES -The following variables are available as both +The following .Xr sysctl 8 -variables and -.Xr loader 8 -tunables: +variables are available: .Bl -tag -width indent .It Va vfs.fusefs.kernelabi_major Major version of the FUSE kernel ABI supported by this driver. @@ -73,7 +71,7 @@ .It Va vfs.fusefs.data_cache_mode Controls how .Nm -will cache file data. +will cache file data for pre-7.23 file systems. A value of 0 will disable caching entirely. Every data access will be forwarded to the daemon. A value of 1 will select write-through caching. @@ -84,33 +82,25 @@ to the daemon by the page daemon. Write-back caching is usually unsafe, especially for FUSE file systems that require network access. -.It Va vfs.fusefs.lookup_cache_enable -Controls whether -.Nm -will cache lookup responses from the file system. -FUSE file systems indicate whether lookup responses should be cacheable, but -it may be useful to globally disable caching them if a file system is -misbehaving. +.Pp +FUSE file systems using protocol 7.23 or later specify their cache behavior +on a per-mountpoint basis, ignoring this sysctl. +.It Va vfs.fusefs.stats.filehandle_count +Current number of open FUSE file handles. +.It Va vfs.fusefs.stats.lookup_cache_hits +Total number of lookup cache hits. +.It Va vfs.fusefs.stats.lookup_cache_misses +Total number of lookup cache misses. +.It Va vfs.fusefs.stats.node_count +Current number of allocated FUSE vnodes. +.It Va vfs.fusefs.stats.ticket_count +Current number of allocated FUSE tickets, which is roughly equal to the number +number of FUSE operations currently being processed by daemons. .\" Undocumented sysctls .\" ==================== -.\" Counters: I intend to rename to vfs.fusefs.stats.* for clarity -.\" vfs.fusefs.lookup_cache_{hits, misses} -.\" vfs.fusefs.filehandle_count -.\" vfs.fusefs.ticker_count -.\" vfs.fusefs.node_count -.\" -.\" vfs.fusefs.version - useless since the driver moved in-tree -.\" vfs.fusefs.reclaim_revoked: I don't understand it well-enough -.\" vfs.fusefs.sync_unmount: dead code .\" vfs.fusefs.enforce_dev_perms: I don't understand it well enough. -.\" vfs.fusefs.init_backgrounded: dead code .\" vfs.fusefs.iov_credit: I don't understand it well enough .\" vfs.fusefs.iov_permanent_bufsize: I don't understand it well enough -.\" vfs.fusefs.fix_broken_io: I don't understand it well enough -.\" vfs.fusefs.sync_resize: useless and should be removed -.\" vfs.fusefs.refresh_size: probably useless? -.\" vfs.fusefs.mmap_enable: why is this optional? -.\" vfs.fusefs.data_cache_invalidate: what is this needed for? .Sh SEE ALSO .Xr mount_fusefs 8 .Sh HISTORY Index: share/man/man9/VOP_FSYNC.9 =================================================================== --- share/man/man9/VOP_FSYNC.9 +++ share/man/man9/VOP_FSYNC.9 @@ -4,6 +4,11 @@ .\" .\" All rights reserved. .\" +.\" Copyright (c) 2019 The FreeBSD Foundation +.\" +.\" Portions of this documentation were written by BFF Storage Systems under +.\" sponsorship from the FreeBSD Foundation. +.\" .\" This program is free software. 
.\" .\" Redistribution and use in source and binary forms, with or without Index: share/mk/bsd.compiler.mk =================================================================== --- share/mk/bsd.compiler.mk +++ share/mk/bsd.compiler.mk @@ -19,6 +19,7 @@ # COMPILER_FEATURES will contain one or more of the following, based on # compiler support for that feature: # +# - c++14: supports full (or nearly full) C++14 programming environment. # - c++11: supports full (or nearly full) C++11 programming environment. # - retpoline: supports the retpoline speculative execution vulnerability # mitigation. @@ -200,6 +201,10 @@ .endif ${X_}COMPILER_FEATURES= +.if ${${X_}COMPILER_TYPE} == "clang" || \ + (${${X_}COMPILER_TYPE} == "gcc" && ${${X_}COMPILER_VERSION} >= 50000) +${X_}COMPILER_FEATURES+= c++14 +.endif .if ${${X_}COMPILER_TYPE} == "clang" || \ (${${X_}COMPILER_TYPE} == "gcc" && ${${X_}COMPILER_VERSION} >= 40800) ${X_}COMPILER_FEATURES+= c++11 Index: sys/fs/fuse/fuse.h =================================================================== --- sys/fs/fuse/fuse.h +++ sys/fs/fuse/fuse.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -63,87 +68,10 @@ #define FUSE_MIN_DAEMON_TIMEOUT 0 /* s */ #define FUSE_MAX_DAEMON_TIMEOUT 600 /* s */ -#ifndef FUSE_FREEBSD_VERSION -#define FUSE_FREEBSD_VERSION "0.4.4" -#endif - -/* Mapping versions to features */ - -#define FUSE_KERNELABI_GEQ(maj, min) \ -(FUSE_KERNEL_VERSION > (maj) || (FUSE_KERNEL_VERSION == (maj) && FUSE_KERNEL_MINOR_VERSION >= (min))) - -/* - * Appearance of new FUSE operations is not always in par with version - * numbering... At least, 7.3 is a sufficient condition for having - * FUSE_{ACCESS,CREATE}. - */ -#if FUSE_KERNELABI_GEQ(7, 3) -#ifndef FUSE_HAS_ACCESS -#define FUSE_HAS_ACCESS 1 -#endif -#ifndef FUSE_HAS_CREATE -#define FUSE_HAS_CREATE 1 -#endif -#else /* FUSE_KERNELABI_GEQ(7, 3) */ -#ifndef FUSE_HAS_ACCESS -#define FUSE_HAS_ACCESS 0 -#endif -#ifndef FUSE_HAS_CREATE -#define FUSE_HAS_CREATE 0 -#endif -#endif - -#if FUSE_KERNELABI_GEQ(7, 7) -#ifndef FUSE_HAS_GETLK -#define FUSE_HAS_GETLK 1 -#endif -#ifndef FUSE_HAS_SETLK -#define FUSE_HAS_SETLK 1 -#endif -#ifndef FUSE_HAS_SETLKW -#define FUSE_HAS_SETLKW 1 -#endif -#ifndef FUSE_HAS_INTERRUPT -#define FUSE_HAS_INTERRUPT 1 -#endif -#else /* FUSE_KERNELABI_GEQ(7, 7) */ -#ifndef FUSE_HAS_GETLK -#define FUSE_HAS_GETLK 0 -#endif -#ifndef FUSE_HAS_SETLK -#define FUSE_HAS_SETLK 0 -#endif -#ifndef FUSE_HAS_SETLKW -#define FUSE_HAS_SETLKW 0 -#endif -#ifndef FUSE_HAS_INTERRUPT -#define FUSE_HAS_INTERRUPT 0 -#endif -#endif - -#if FUSE_KERNELABI_GEQ(7, 8) -#ifndef FUSE_HAS_FLUSH_RELEASE -#define FUSE_HAS_FLUSH_RELEASE 1 -/* - * "DESTROY" came in the middle of the 7.8 era, - * so this is not completely exact... 
- */ -#ifndef FUSE_HAS_DESTROY -#define FUSE_HAS_DESTROY 1 -#endif -#endif -#else /* FUSE_KERNELABI_GEQ(7, 8) */ -#ifndef FUSE_HAS_FLUSH_RELEASE -#define FUSE_HAS_FLUSH_RELEASE 0 -#ifndef FUSE_HAS_DESTROY -#define FUSE_HAS_DESTROY 0 -#endif -#endif -#endif - /* misc */ SYSCTL_DECL(_vfs_fusefs); +SYSCTL_DECL(_vfs_fusefs_stats); /* Fuse locking */ Index: sys/fs/fuse/fuse_device.c =================================================================== --- sys/fs/fuse/fuse_device.c +++ sys/fs/fuse/fuse_device.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -81,27 +86,28 @@ #include #include "fuse.h" +#include "fuse_internal.h" #include "fuse_ipc.h" -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , device, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*"); static struct cdev *fuse_dev; +static d_kqfilter_t fuse_device_filter; static d_open_t fuse_device_open; -static d_close_t fuse_device_close; static d_poll_t fuse_device_poll; static d_read_t fuse_device_read; static d_write_t fuse_device_write; static struct cdevsw fuse_device_cdevsw = { + .d_kqfilter = fuse_device_filter, .d_open = fuse_device_open, - .d_close = fuse_device_close, .d_name = "fuse", .d_poll = fuse_device_poll, .d_read = fuse_device_read, @@ -109,6 +115,15 @@ .d_version = D_VERSION, }; +static int fuse_device_filt_read(struct knote *kn, long hint); +static void fuse_device_filt_detach(struct knote *kn); + +struct filterops fuse_device_rfiltops = { + .f_isfd = 1, + .f_detach = fuse_device_filt_detach, + .f_event = fuse_device_filt_read, +}; + /**************************** * * >>> Fuse device op defs @@ -119,11 +134,100 @@ fdata_dtor(void *arg) { struct fuse_data *fdata; + struct fuse_ticket *tick; fdata = arg; + if (fdata == NULL) + return; + + fdata_set_dead(fdata); + + FUSE_LOCK(); + fuse_lck_mtx_lock(fdata->aw_mtx); + /* wakup poll()ers */ + selwakeuppri(&fdata->ks_rsel, PZERO + 1); + /* Don't let syscall handlers wait in vain */ + while ((tick = fuse_aw_pop(fdata))) { + fuse_lck_mtx_lock(tick->tk_aw_mtx); + fticket_set_answered(tick); + tick->tk_aw_errno = ENOTCONN; + wakeup(tick); + fuse_lck_mtx_unlock(tick->tk_aw_mtx); + FUSE_ASSERT_AW_DONE(tick); + fuse_ticket_drop(tick); + } + fuse_lck_mtx_unlock(fdata->aw_mtx); + + /* Cleanup unsent operations */ + fuse_lck_mtx_lock(fdata->ms_mtx); + while ((tick = fuse_ms_pop(fdata))) { + fuse_ticket_drop(tick); + } + fuse_lck_mtx_unlock(fdata->ms_mtx); + FUSE_UNLOCK(); + fdata_trydestroy(fdata); } +static int +fuse_device_filter(struct cdev *dev, struct knote *kn) +{ + struct fuse_data *data; + int error; + + error = devfs_get_cdevpriv((void **)&data); + + /* EVFILT_WRITE is not supported; the device is always ready to write */ + if (error == 0 && kn->kn_filter == EVFILT_READ) { + kn->kn_fop = &fuse_device_rfiltops; + kn->kn_hook = data; + knlist_add(&data->ks_rsel.si_note, kn, 0); + error = 0; + } else if (error == 0) { + error = EINVAL; + kn->kn_data = error; + } + + return (error); +} + +static void +fuse_device_filt_detach(struct knote *kn) +{ + 
struct fuse_data *data; + + data = (struct fuse_data*)kn->kn_hook; + MPASS(data != NULL); + knlist_remove(&data->ks_rsel.si_note, kn, 0); + kn->kn_hook = NULL; +} + +static int +fuse_device_filt_read(struct knote *kn, long hint) +{ + struct fuse_data *data; + int ready; + + data = (struct fuse_data*)kn->kn_hook; + MPASS(data != NULL); + + mtx_assert(&data->ms_mtx, MA_OWNED); + if (fdata_get_dead(data)) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = ENODEV; + kn->kn_data = 1; + ready = 1; + } else if (STAILQ_FIRST(&data->ms_head)) { + MPASS(data->ms_count >= 1); + kn->kn_data = data->ms_count; + ready = 1; + } else { + ready = 0; + } + + return (ready); +} + /* * Resources are set up on a per-open basis */ @@ -133,52 +237,17 @@ struct fuse_data *fdata; int error; - SDT_PROBE2(fuse, , device, trace, 1, "device open"); + SDT_PROBE2(fusefs, , device, trace, 1, "device open"); fdata = fdata_alloc(dev, td->td_ucred); error = devfs_set_cdevpriv(fdata, fdata_dtor); if (error != 0) fdata_trydestroy(fdata); else - SDT_PROBE2(fuse, , device, trace, 1, "device open success"); + SDT_PROBE2(fusefs, , device, trace, 1, "device open success"); return (error); } -static int -fuse_device_close(struct cdev *dev, int fflag, int devtype, struct thread *td) -{ - struct fuse_data *data; - struct fuse_ticket *tick; - int error; - - error = devfs_get_cdevpriv((void **)&data); - if (error != 0) - return (error); - if (!data) - panic("no fuse data upon fuse device close"); - fdata_set_dead(data); - - FUSE_LOCK(); - fuse_lck_mtx_lock(data->aw_mtx); - /* wakup poll()ers */ - selwakeuppri(&data->ks_rsel, PZERO + 1); - /* Don't let syscall handlers wait in vain */ - while ((tick = fuse_aw_pop(data))) { - fuse_lck_mtx_lock(tick->tk_aw_mtx); - fticket_set_answered(tick); - tick->tk_aw_errno = ENOTCONN; - wakeup(tick); - fuse_lck_mtx_unlock(tick->tk_aw_mtx); - FUSE_ASSERT_AW_DONE(tick); - fuse_ticket_drop(tick); - } - fuse_lck_mtx_unlock(data->aw_mtx); - FUSE_UNLOCK(); - - SDT_PROBE2(fuse, , device, trace, 1, "device close"); - return (0); -} - int fuse_device_poll(struct cdev *dev, int events, struct thread *td) { @@ -219,7 +288,7 @@ int buflen[3]; int i; - SDT_PROBE2(fuse, , device, trace, 1, "fuse device read"); + SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read"); err = devfs_get_cdevpriv((void **)&data); if (err != 0) @@ -228,7 +297,7 @@ fuse_lck_mtx_lock(data->ms_mtx); again: if (fdata_get_dead(data)) { - SDT_PROBE2(fuse, , device, trace, 2, + SDT_PROBE2(fusefs, , device, trace, 2, "we know early on that reader should be kicked so we " "don't wait for news"); fuse_lck_mtx_unlock(data->ms_mtx); @@ -256,7 +325,7 @@ * -- and some other cases, too, tho not totally clear, when * (cv_signal/wakeup_one signals the whole process ?) 
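The new d_kqfilter hook added above lets a daemon wait for requests with kevent(2) instead of, or alongside, poll(2); EVFILT_WRITE is rejected because the device is always ready for writes, and EV_EOF is raised once the mount data goes dead. A minimal sketch of the daemon side, with most error handling trimmed; wait_for_request() and the fusefd parameter are illustrative names only.

    #include <sys/types.h>
    #include <sys/event.h>

    #include <err.h>
    #include <unistd.h>

    void
    wait_for_request(int fusefd)
    {
            struct kevent kev, ev;
            int kq;

            if ((kq = kqueue()) < 0)
                    err(1, "kqueue");
            /* Register read interest; kn_data reports queued messages. */
            EV_SET(&kev, fusefd, EVFILT_READ, EV_ADD, 0, 0, NULL);
            if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
                    err(1, "kevent register");

            /* Block until the kernel has at least one request queued. */
            if (kevent(kq, NULL, 0, &ev, 1, NULL) < 0)
                    err(1, "kevent wait");
            if (ev.flags & EV_EOF)
                    errx(1, "mount was torn down");
            close(kq);
    }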
*/ - SDT_PROBE2(fuse, , device, trace, 1, "no message on thread"); + SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread"); goto again; } fuse_lck_mtx_unlock(data->ms_mtx); @@ -266,9 +335,10 @@ * somebody somewhere -- eg., umount routine -- * wants this liaison finished off */ - SDT_PROBE2(fuse, , device, trace, 2, "reader is to be sacked"); + SDT_PROBE2(fusefs, , device, trace, 2, + "reader is to be sacked"); if (tick) { - SDT_PROBE2(fuse, , device, trace, 2, "weird -- " + SDT_PROBE2(fusefs, , device, trace, 2, "weird -- " "\"kick\" is set tho there is message"); FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); @@ -276,7 +346,7 @@ return (ENODEV); /* This should make the daemon get off * of us */ } - SDT_PROBE2(fuse, , device, trace, 1, + SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read message successfully"); KASSERT(tick->tk_ms_bufdata || tick->tk_ms_bufsize == 0, @@ -311,7 +381,7 @@ */ if (uio->uio_resid < buflen[i]) { fdata_set_dead(data); - SDT_PROBE2(fuse, , device, trace, 2, + SDT_PROBE2(fusefs, , device, trace, 2, "daemon is stupid, kick it off..."); err = ENODEV; break; @@ -331,23 +401,26 @@ fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) { if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) { - SDT_PROBE2(fuse, , device, trace, 1, "Format error: body size " + SDT_PROBE2(fusefs, , device, trace, 1, + "Format error: body size " "differs from size claimed by header"); return (EINVAL); } - if (uio->uio_resid && ohead->error) { - SDT_PROBE2(fuse, , device, trace, 1, + if (uio->uio_resid && ohead->unique != 0 && ohead->error) { + SDT_PROBE2(fusefs, , device, trace, 1, "Format error: non zero error but message had a body"); return (EINVAL); } - /* Sanitize the linuxism of negative errnos */ - ohead->error = -(ohead->error); return (0); } -SDT_PROBE_DEFINE1(fuse, , device, fuse_device_write_bumped_into_callback, - "uint64_t"); +SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify, + "struct fuse_out_header*"); +SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket, + "uint64_t"); +SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found, + "struct fuse_ticket*"); /* * fuse_device_write first reads the header sent by the daemon. * If that's OK, looks up ticket/callback node by the unique id seen in header. 
@@ -360,15 +433,17 @@ struct fuse_out_header ohead; int err = 0; struct fuse_data *data; - struct fuse_ticket *tick, *x_tick; + struct mount *mp; + struct fuse_ticket *tick, *itick, *x_tick; int found = 0; err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); + mp = data->mp; if (uio->uio_resid < sizeof(struct fuse_out_header)) { - SDT_PROBE2(fuse, , device, trace, 1, + SDT_PROBE2(fusefs, , device, trace, 1, "fuse_device_write got less than a header!"); fdata_set_dead(data); return (EINVAL); @@ -393,15 +468,29 @@ fuse_lck_mtx_lock(data->aw_mtx); TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link, x_tick) { - SDT_PROBE1(fuse, , device, - fuse_device_write_bumped_into_callback, - tick->tk_unique); if (tick->tk_unique == ohead.unique) { + SDT_PROBE1(fusefs, , device, fuse_device_write_found, + tick); found = 1; fuse_aw_remove(tick); break; } } + if (found && tick->irq_unique > 0) { + /* + * Discard the FUSE_INTERRUPT ticket that tried to interrupt + * this operation + */ + TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link, + x_tick) { + if (itick->tk_unique == tick->irq_unique) { + fuse_aw_remove(itick); + fuse_ticket_drop(itick); + break; + } + } + tick->irq_unique = 0; + } fuse_lck_mtx_unlock(data->aw_mtx); if (found) { @@ -414,13 +503,15 @@ * via ticket_drop(), so no manual mucking * around...) */ - SDT_PROBE2(fuse, , device, trace, 1, + SDT_PROBE2(fusefs, , device, trace, 1, "pass ticket to a callback"); + /* Sanitize the linuxism of negative errnos */ + ohead.error *= -1; memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); err = tick->tk_aw_handler(tick, uio); } else { /* pretender doesn't wanna do anything with answer */ - SDT_PROBE2(fuse, , device, trace, 1, + SDT_PROBE2(fusefs, , device, trace, 1, "stuff devalidated, so we drop it"); } @@ -430,11 +521,51 @@ * because fuse_ticket_drop() will deal with refcount anyway. */ fuse_ticket_drop(tick); + } else if (ohead.unique == 0){ + /* unique == 0 means asynchronous notification */ + SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead); + switch (ohead.error) { + case FUSE_NOTIFY_INVAL_ENTRY: + err = fuse_internal_invalidate_entry(mp, uio); + break; + case FUSE_NOTIFY_INVAL_INODE: + err = fuse_internal_invalidate_inode(mp, uio); + break; + case FUSE_NOTIFY_RETRIEVE: + case FUSE_NOTIFY_STORE: + /* + * Unimplemented. I don't know of any file systems + * that use them, and the protocol isn't sound anyway, + * since the notification messages don't include the + * inode's generation number. Without that, it's + * possible to manipulate the cache of the wrong vnode. + * Finally, it's not defined what this message should + * do for a file with dirty cache. + */ + case FUSE_NOTIFY_POLL: + /* Unimplemented. See comments in fuse_vnops */ + default: + /* Not implemented */ + err = ENOSYS; + } } else { /* no callback at all! */ - SDT_PROBE2(fuse, , device, trace, 1, - "erhm, no handler for this response"); - err = EINVAL; + SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket, + ohead.unique); + if (ohead.error == -EAGAIN) { + /* + * This was probably a response to a FUSE_INTERRUPT + * operation whose original operation is already + * complete. We can't store FUSE_INTERRUPT tickets + * indefinitely because their responses are optional. + * So we delete them when the original operation + * completes. And sadly the fuse_header_out doesn't + * identify the opcode, so we have to guess. 
+ */ + err = 0; + } else { + err = EINVAL; + } } return (err); @@ -445,7 +576,7 @@ { fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR, - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, "fuse"); + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse"); if (fuse_dev == NULL) return (ENOMEM); return (0); Index: sys/fs/fuse/fuse_file.h =================================================================== --- sys/fs/fuse/fuse_file.h +++ sys/fs/fuse/fuse_file.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -66,52 +71,115 @@ #include #include +/* + * The fufh type is the access mode of the fuse file handle. It's the portion + * of the open(2) flags related to permission. + */ typedef enum fufh_type { FUFH_INVALID = -1, - FUFH_RDONLY = 0, - FUFH_WRONLY = 1, - FUFH_RDWR = 2, - FUFH_MAXTYPE = 3, + FUFH_RDONLY = O_RDONLY, + FUFH_WRONLY = O_WRONLY, + FUFH_RDWR = O_RDWR, + FUFH_EXEC = O_EXEC, } fufh_type_t; -_Static_assert(FUFH_RDONLY == O_RDONLY, "RDONLY"); -_Static_assert(FUFH_WRONLY == O_WRONLY, "WRONLY"); -_Static_assert(FUFH_RDWR == O_RDWR, "RDWR"); +/* + * FUSE File Handles + * + * The FUSE protocol says that a server may assign a unique 64-bit file handle + * every time that a file is opened. Effectively, that's once for each file + * descriptor. + * + * Unfortunately, the VFS doesn't help us here. VOPs don't have a + * struct file* argument. fileops do, but many syscalls bypass the fileops + * layer and go straight to a vnode. Some, like writing from cache, can't + * track a file handle even in theory. The entire concept of the file handle + * is a product of FUSE's Linux origins; Linux lacks vnodes and almost every + * file system operation takes a struct file* argument. + * + * Since FreeBSD's VFS is more file descriptor-agnostic, we must store FUSE + * filehandles in the vnode. One option would be to only store a single file + * handle and never open FUSE files concurrently. That's what NetBSD does. + * But that violates FUSE's security model. FUSE expects the server to do all + * authorization (except when mounted with -o default_permissions). In order + * to do that, the server needs us to send FUSE_OPEN every time somebody opens + * a new file descriptor. + * + * Another option would be to never open FUSE files concurrently, but send a + * FUSE_ACCESS prior to every open after the first. That would give the server + * the opportunity to authorize the access. Unfortunately, the FUSE protocol + * makes ACCESS optional. File systems that don't implement it are assumed to + * authorize everything. A survey of 32 fuse file systems showed that only 14 + * implemented access. Among the laggards were a few that really ought to be + * doing server-side authorization. + * + * So we do something hacky, similar to what OpenBSD, Illumos, and OSXFuse do. + * we store a list of file handles, one for each combination of vnode, uid, + * gid, pid, and access mode. When opening a file, we first check whether + * there's already a matching file handle. If so, we reuse it. If not, we + * send FUSE_OPEN and create a new file handle. That minimizes the number of + * open file handles while still allowing the server to authorize stuff. 
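For context on the unique == 0 path and the errno sign handling above: a FUSE server answers a request by writing a fuse_out_header carrying a Linux-style negative errno followed by the op-specific body, and it sends unsolicited notifications with unique set to 0 and the notification code in the error field. A rough sketch of both message shapes; the struct definitions below are local stand-ins that mirror the fuse_out_header and fuse_notify_inval_inode_out layouts in the protocol headers, not the real declarations.

    #include <sys/uio.h>

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>

    /* Stand-ins for the protocol structs shipped with the driver. */
    struct out_header { uint32_t len; int32_t error; uint64_t unique; };
    struct inval_inode_out { uint64_t ino; int64_t off; int64_t len; };

    /* Answer request 'unique' with an error; note the negated errno. */
    static ssize_t
    reply_error(int fusefd, uint64_t unique, int errnum)
    {
            struct out_header oh;

            memset(&oh, 0, sizeof(oh));
            oh.len = sizeof(oh);
            oh.error = -errnum;     /* the kernel flips the sign back */
            oh.unique = unique;
            return (write(fusefd, &oh, sizeof(oh)));
    }

    /* Push an inode invalidation; unique == 0 marks it asynchronous. */
    static ssize_t
    notify_inval_inode(int fusefd, uint64_t ino, int notify_code)
    {
            struct out_header oh;
            struct inval_inode_out arg;
            struct iovec iov[2];

            memset(&oh, 0, sizeof(oh));
            oh.len = sizeof(oh) + sizeof(arg);
            oh.error = notify_code; /* e.g. FUSE_NOTIFY_INVAL_INODE */
            oh.unique = 0;
            arg.ino = ino;
            arg.off = 0;
            arg.len = -1;           /* invalidate the whole file */
            iov[0].iov_base = &oh;
            iov[0].iov_len = sizeof(oh);
            iov[1].iov_base = &arg;
            iov[1].iov_len = sizeof(arg);
            return (writev(fusefd, iov, 2));
    }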
+ * + * VOPs that need a file handle search through the list for a close match. + * They can't be guaranteed of finding an exact match because, for example, a + * process may have changed its UID since opening the file. Also, most VOPs + * don't know exactly what permission they need. Is O_RDWR required or is + * O_RDONLY good enough? So the file handle we end up using may not be exactly + * the one we're supposed to use with that file descriptor. But if the FUSE + * file system isn't too picky, it will work. (FWIW even Linux sometimes + * guesses the file handle, during writes from cache or most SETATTR + * operations). + * + * I suspect this mess is part of the reason why neither NFS nor 9P have an + * equivalent of FUSE file handles. + */ struct fuse_filehandle { + LIST_ENTRY(fuse_filehandle) next; + + /* The filehandle returned by FUSE_OPEN */ uint64_t fh_id; - fufh_type_t fh_type; -}; -#define FUFH_IS_VALID(f) ((f)->fh_type != FUFH_INVALID) + /* + * flags returned by FUSE_OPEN + * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE + * Unsupported: + * FOPEN_NONSEEKABLE: Adding support would require a new per-file + * or per-vnode attribute, which would have to be checked by + * kern_lseek (and others) for every file system. The benefit is + * dubious, since I'm unaware of any file systems in ports that use + * this flag. + */ + uint32_t fuse_open_flags; -static inline fufh_type_t -fuse_filehandle_xlate_from_mmap(int fflags) -{ - if (fflags & (PROT_READ | PROT_WRITE)) - return FUFH_RDWR; - else if (fflags & (PROT_WRITE)) - return FUFH_WRONLY; - else if ((fflags & PROT_READ) || (fflags & PROT_EXEC)) - return FUFH_RDONLY; - else - return FUFH_INVALID; -} + /* The access mode of the file handle */ + fufh_type_t fufh_type; -static inline fufh_type_t -fuse_filehandle_xlate_from_fflags(int fflags) -{ - if ((fflags & FREAD) && (fflags & FWRITE)) - return FUFH_RDWR; - else if (fflags & (FWRITE)) - return FUFH_WRONLY; - else if (fflags & (FREAD)) - return FUFH_RDONLY; - else - panic("FUSE: What kind of a flag is this (%x)?", fflags); -} + /* Credentials used to open the file */ + gid_t gid; + pid_t pid; + uid_t uid; +}; +#define FUFH_IS_VALID(f) ((f)->fufh_type != FUFH_INVALID) + +/* + * Get the flags to use for FUSE_CREATE, FUSE_OPEN and FUSE_RELEASE + * + * These are supposed to be the same as the flags argument to open(2). + * However, since we can't reliably associate a fuse_filehandle with a specific + * file descriptor it would would be dangerous to include anything more than + * the access mode flags. For example, suppose we open a file twice, once with + * O_APPEND and once without. Then the user pwrite(2)s to offset using the + * second file descriptor. If fusefs uses the first file handle, then the + * server may append the write to the end of the file rather than at offset 0. + * To prevent problems like this, we only ever send the portion of flags + * related to access mode. + * + * It's essential to send that portion, because FUSE uses it for server-side + * authorization. 
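To restate the reuse policy from the long comment above in code form: a handle is reusable when its access type covers the request and its opening credentials match, with pid == 0 acting as a wildcard, and an RDWR handle may stand in for read or write but never for exec. This is a distilled, userspace-testable mirror of the checks fuse_filehandle_validrw()/get() perform later in the diff, not new driver logic; handle_key and handle_matches are illustrative names.

    #include <sys/types.h>

    #include <fcntl.h>
    #include <stdbool.h>

    /* Simplified mirror of the fields fuse_filehandle keys on. */
    struct handle_key {
            int     type;   /* O_RDONLY, O_WRONLY, O_RDWR or O_EXEC */
            uid_t   uid;
            gid_t   gid;
            pid_t   pid;
    };

    static bool
    handle_matches(const struct handle_key *h, int want_type,
        uid_t uid, gid_t gid, pid_t pid)
    {
            /* Only a handle opened with the same credentials counts. */
            if (h->uid != uid || h->gid != gid)
                    return (false);
            /* pid == 0 means "any process". */
            if (pid != 0 && h->pid != pid)
                    return (false);
            if (h->type == want_type)
                    return (true);
            /* RDWR covers RDONLY/WRONLY requests, but not EXEC. */
            return (want_type != O_EXEC && h->type == O_RDWR);
    }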
+ */ static inline int -fuse_filehandle_xlate_to_oflags(fufh_type_t type) +fufh_type_2_fflags(fufh_type_t type) { int oflags = -1; @@ -119,6 +187,7 @@ case FUFH_RDONLY: case FUFH_WRONLY: case FUFH_RDWR: + case FUFH_EXEC: oflags = type; break; default: @@ -128,19 +197,28 @@ return oflags; } -int fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type); -fufh_type_t fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type); -int fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp); -int fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp); +bool fuse_filehandle_validrw(struct vnode *vp, int mode, + struct ucred *cred, pid_t pid); +int fuse_filehandle_get(struct vnode *vp, int fflag, + struct fuse_filehandle **fufhp, struct ucred *cred, + pid_t pid); +int fuse_filehandle_get_anyflags(struct vnode *vp, + struct fuse_filehandle **fufhp, struct ucred *cred, + pid_t pid); +int fuse_filehandle_getrw(struct vnode *vp, int fflag, + struct fuse_filehandle **fufhp, struct ucred *cred, + pid_t pid); void fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp, uint64_t fh_id); -int fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type, + struct fuse_filehandle **fufhp, struct thread *td, + struct ucred *cred, struct fuse_open_out *foo); +int fuse_filehandle_open(struct vnode *vp, int mode, struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred); -int fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type, +int fuse_filehandle_close(struct vnode *vp, struct fuse_filehandle *fufh, struct thread *td, struct ucred *cred); + +void fuse_file_init(void); +void fuse_file_destroy(void); #endif /* _FUSE_FILE_H_ */ Index: sys/fs/fuse/fuse_file.c =================================================================== --- sys/fs/fuse/fuse_file.c +++ sys/fs/fuse/fuse_file.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -59,6 +64,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include #include @@ -79,52 +85,61 @@ #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" +#include "fuse_io.h" #include "fuse_ipc.h" #include "fuse_node.h" -SDT_PROVIDER_DECLARE(fuse); +MALLOC_DEFINE(M_FUSE_FILEHANDLE, "fuse_filefilehandle", "FUSE file handle"); + +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , file, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , file, trace, "int", "char*"); -static int fuse_fh_count = 0; +static counter_u64_t fuse_fh_count; -SYSCTL_INT(_vfs_fusefs, OID_AUTO, filehandle_count, CTLFLAG_RD, - &fuse_fh_count, 0, "number of open FUSE filehandles"); +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, filehandle_count, CTLFLAG_RD, + &fuse_fh_count, "number of open FUSE filehandles"); +/* Get the FUFH type for a particular access mode */ +static inline fufh_type_t +fflags_2_fufh_type(int fflags) +{ + if ((fflags & FREAD) && (fflags & FWRITE)) + return FUFH_RDWR; + else if (fflags & (FWRITE)) + return FUFH_WRONLY; + else if (fflags & (FREAD)) + return FUFH_RDONLY; + else if (fflags & (FEXEC)) + return FUFH_EXEC; + else + panic("FUSE: What kind of a flag is this (%x)?", fflags); +} + int -fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type, +fuse_filehandle_open(struct vnode *vp, int a_mode, struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred) { struct fuse_dispatcher fdi; struct fuse_open_in *foi; struct fuse_open_out *foo; + fufh_type_t fufh_type; int err = 0; int oflags = 0; int op = FUSE_OPEN; - if (fuse_filehandle_valid(vp, fufh_type)) { - panic("FUSE: filehandle_open called despite valid fufh (type=%d)", - fufh_type); - /* NOTREACHED */ - } - /* - * Note that this means we are effectively FILTERING OUT open() flags. - */ - oflags = fuse_filehandle_xlate_to_oflags(fufh_type); + fufh_type = fflags_2_fufh_type(a_mode); + oflags = fufh_type_2_fflags(fufh_type); if (vnode_isdir(vp)) { op = FUSE_OPENDIR; - if (fufh_type != FUFH_RDONLY) { - SDT_PROBE2(fuse, , file, trace, 1, - "non-rdonly fh requested for a directory?"); - printf("FUSE:non-rdonly fh requested for a directory?\n"); - fufh_type = FUFH_RDONLY; - } + /* vn_open_vnode already rejects FWRITE on directories */ + MPASS(fufh_type == FUFH_RDONLY || fufh_type == FUFH_EXEC); } fdisp_init(&fdi, sizeof(*foi)); fdisp_make_vp(&fdi, op, vp, td, cred); @@ -133,7 +148,7 @@ foi->flags = oflags; if ((err = fdisp_wait_answ(&fdi))) { - SDT_PROBE2(fuse, , file, trace, 1, + SDT_PROBE2(fusefs, , file, trace, 1, "OUCH ... daemon didn't give fh"); if (err == ENOENT) { fuse_internal_vnode_disappear(vp); @@ -142,42 +157,24 @@ } foo = fdi.answ; - fuse_filehandle_init(vp, fufh_type, fufhp, foo->fh); + fuse_filehandle_init(vp, fufh_type, fufhp, td, cred, foo); + fuse_vnode_open(vp, foo->open_flags, td); - /* - * For WRONLY opens, force DIRECT_IO. This is necessary - * since writing a partial block through the buffer cache - * will result in a read of the block and that read won't - * be allowed by the WRONLY open. 
- */ - if (fufh_type == FUFH_WRONLY) - fuse_vnode_open(vp, foo->open_flags | FOPEN_DIRECT_IO, td); - else - fuse_vnode_open(vp, foo->open_flags, td); - out: fdisp_destroy(&fdi); return err; } int -fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type, +fuse_filehandle_close(struct vnode *vp, struct fuse_filehandle *fufh, struct thread *td, struct ucred *cred) { struct fuse_dispatcher fdi; struct fuse_release_in *fri; - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_filehandle *fufh = NULL; int err = 0; int op = FUSE_RELEASE; - fufh = &(fvdat->fufh[fufh_type]); - if (!FUFH_IS_VALID(fufh)) { - panic("FUSE: filehandle_put called on invalid fufh (type=%d)", - fufh_type); - /* NOTREACHED */ - } if (fuse_isdeadfs(vp)) { goto out; } @@ -187,96 +184,194 @@ fdisp_make_vp(&fdi, op, vp, td, cred); fri = fdi.indata; fri->fh = fufh->fh_id; - fri->flags = fuse_filehandle_xlate_to_oflags(fufh_type); + fri->flags = fufh_type_2_fflags(fufh->fufh_type); + /* + * If the file has a POSIX lock then we're supposed to set lock_owner. + * If not, then lock_owner is undefined. So we may as well always set + * it. + */ + fri->lock_owner = td->td_proc->p_pid; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); out: - atomic_subtract_acq_int(&fuse_fh_count, 1); - fufh->fh_id = (uint64_t)-1; - fufh->fh_type = FUFH_INVALID; + counter_u64_add(fuse_fh_count, -1); + LIST_REMOVE(fufh, next); + free(fufh, M_FUSE_FILEHANDLE); return err; } -int -fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type) -{ - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_filehandle *fufh; - - fufh = &(fvdat->fufh[fufh_type]); - return FUFH_IS_VALID(fufh); -} - /* * Check for a valid file handle, first the type requested, but if that * isn't valid, try for FUFH_RDWR. - * Return the FUFH type that is valid or FUFH_INVALID if there are none. - * This is a variant of fuse_filehandle_vaild() analogous to - * fuse_filehandle_getrw(). + * Return true if there is any file handle with the correct credentials and + * a fufh type that includes the provided one. 
+ * A pid of 0 means "don't care" */ -fufh_type_t -fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type) +bool +fuse_filehandle_validrw(struct vnode *vp, int mode, + struct ucred *cred, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; + fufh_type_t fufh_type = fflags_2_fufh_type(mode); - fufh = &fvdat->fufh[fufh_type]; - if (FUFH_IS_VALID(fufh) != 0) - return (fufh_type); - fufh = &fvdat->fufh[FUFH_RDWR]; - if (FUFH_IS_VALID(fufh) != 0) - return (FUFH_RDWR); - return (FUFH_INVALID); + /* + * Unlike fuse_filehandle_get, we want to search for a filehandle with + * the exact cred, and no fallback + */ + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->fufh_type == fufh_type && + fufh->uid == cred->cr_uid && + fufh->gid == cred->cr_rgid && + (pid == 0 || fufh->pid == pid)) + return true; + } + + if (fufh_type == FUFH_EXEC) + return false; + + /* Fallback: find a RDWR list entry with the right cred */ + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->fufh_type == FUFH_RDWR && + fufh->uid == cred->cr_uid && + fufh->gid == cred->cr_rgid && + (pid == 0 || fufh->pid == pid)) + return true; + } + + return false; } int -fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp) +fuse_filehandle_get(struct vnode *vp, int fflag, + struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; + fufh_type_t fufh_type; - fufh = &(fvdat->fufh[fufh_type]); - if (!FUFH_IS_VALID(fufh)) + fufh_type = fflags_2_fufh_type(fflag); + /* cred can be NULL for in-kernel clients */ + if (cred == NULL) + goto fallback; + + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->fufh_type == fufh_type && + fufh->uid == cred->cr_uid && + fufh->gid == cred->cr_rgid && + (pid == 0 || fufh->pid == pid)) + goto found; + } + +fallback: + /* Fallback: find a list entry with the right flags */ + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->fufh_type == fufh_type) + break; + } + + if (fufh == NULL) return EBADF; + +found: if (fufhp != NULL) *fufhp = fufh; return 0; } +/* Get a file handle with any kind of flags */ int -fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp) +fuse_filehandle_get_anyflags(struct vnode *vp, + struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; - fufh = &(fvdat->fufh[fufh_type]); - if (!FUFH_IS_VALID(fufh)) { - fufh_type = FUFH_RDWR; + if (cred == NULL) + goto fallback; + + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->uid == cred->cr_uid && + fufh->gid == cred->cr_rgid && + (pid == 0 || fufh->pid == pid)) + goto found; } - return fuse_filehandle_get(vp, fufh_type, fufhp); + +fallback: + /* Fallback: find any list entry */ + fufh = LIST_FIRST(&fvdat->handles); + + if (fufh == NULL) + return EBADF; + +found: + if (fufhp != NULL) + *fufhp = fufh; + return 0; } +int +fuse_filehandle_getrw(struct vnode *vp, int fflag, + struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) +{ + int err; + + err = fuse_filehandle_get(vp, fflag, fufhp, cred, pid); + if (err) + err = fuse_filehandle_get(vp, FREAD | FWRITE, fufhp, cred, pid); + return err; +} + void fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp, uint64_t fh_id) + struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred, + struct fuse_open_out *foo) { struct 
fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; - fufh = &(fvdat->fufh[fufh_type]); - MPASS(!FUFH_IS_VALID(fufh)); - fufh->fh_id = fh_id; - fufh->fh_type = fufh_type; + fufh = malloc(sizeof(struct fuse_filehandle), M_FUSE_FILEHANDLE, + M_WAITOK); + MPASS(fufh != NULL); + fufh->fh_id = foo->fh; + fufh->fufh_type = fufh_type; + fufh->gid = cred->cr_rgid; + fufh->uid = cred->cr_uid; + fufh->pid = td->td_proc->p_pid; + fufh->fuse_open_flags = foo->open_flags; if (!FUFH_IS_VALID(fufh)) { panic("FUSE: init: invalid filehandle id (type=%d)", fufh_type); } + LIST_INSERT_HEAD(&fvdat->handles, fufh, next); if (fufhp != NULL) *fufhp = fufh; - atomic_add_acq_int(&fuse_fh_count, 1); + counter_u64_add(fuse_fh_count, 1); + + if (foo->open_flags & FOPEN_DIRECT_IO) { + ASSERT_VOP_ELOCKED(vp, __func__); + VTOFUD(vp)->flag |= FN_DIRECTIO; + fuse_io_invalbuf(vp, td); + } else { + if ((foo->open_flags & FOPEN_KEEP_CACHE) == 0) + fuse_io_invalbuf(vp, td); + VTOFUD(vp)->flag &= ~FN_DIRECTIO; + } + +} + +void +fuse_file_init(void) +{ + fuse_fh_count = counter_u64_alloc(M_WAITOK); + counter_u64_zero(fuse_fh_count); +} + +void +fuse_file_destroy(void) +{ + counter_u64_free(fuse_fh_count); } Index: sys/fs/fuse/fuse_internal.h =================================================================== --- sys/fs/fuse/fuse_internal.h +++ sys/fs/fuse/fuse_internal.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -61,6 +66,7 @@ #define _FUSE_INTERNAL_H_ #include +#include #include #include #include @@ -68,6 +74,9 @@ #include "fuse_ipc.h" #include "fuse_node.h" +extern counter_u64_t fuse_lookup_cache_hits; +extern counter_u64_t fuse_lookup_cache_misses; + static inline bool vfs_isrdonly(struct mount *mp) { @@ -80,12 +89,6 @@ return (vp->v_mount); } -static inline bool -vnode_mountedhere(struct vnode *vp) -{ - return (vp->v_mountedhere != NULL); -} - static inline enum vtype vnode_vtype(struct vnode *vp) { @@ -134,12 +137,6 @@ uio->uio_offset = offset; } -static inline void -uio_setresid(struct uio *uio, ssize_t resid) -{ - uio->uio_resid = resid; -} - /* miscellaneous */ static inline bool @@ -156,25 +153,57 @@ return (vp->v_mount->mnt_stat.f_iosize); } -/* access */ +/* + * Make a cacheable timeout in bintime format value based on a fuse_attr_out + * response + */ +static inline void +fuse_validity_2_bintime(uint64_t attr_valid, uint32_t attr_valid_nsec, + struct bintime *timeout) +{ + struct timespec now, duration, timeout_ts; -#define FVP_ACCESS_NOOP 0x01 + getnanouptime(&now); + /* "+ 2" is the bound of attr_valid_nsec + now.tv_nsec */ + /* Why oh why isn't there a TIME_MAX defined? 
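The switch from an atomic int sysctl to counter(9) above follows the usual pattern: allocate in the module's init path, export with SYSCTL_COUNTER_U64, bump with counter_u64_add(), and free on unload. A generic kernel-side sketch of that idiom; example_count and the example_* functions are placeholders, not part of this change.

    #include <sys/param.h>
    #include <sys/counter.h>
    #include <sys/kernel.h>
    #include <sys/malloc.h>
    #include <sys/sysctl.h>

    SYSCTL_DECL(_vfs);

    static counter_u64_t example_count;
    SYSCTL_COUNTER_U64(_vfs, OID_AUTO, example_count, CTLFLAG_RD,
        &example_count, "example per-CPU counter");

    static void
    example_init(void)            /* called from module load */
    {
            /* counter_u64_alloc(M_WAITOK) cannot fail. */
            example_count = counter_u64_alloc(M_WAITOK);
            counter_u64_zero(example_count);
    }

    static void
    example_event(void)
    {
            /* Lock-free per-CPU increment; decrement is add(..., -1). */
            counter_u64_add(example_count, 1);
    }

    static void
    example_destroy(void)         /* called from module unload */
    {
            counter_u64_free(example_count);
    }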
*/ + if (attr_valid >= INT_MAX || attr_valid + now.tv_sec + 2 >= INT_MAX) { + timeout->sec = INT_MAX; + } else { + duration.tv_sec = attr_valid; + duration.tv_nsec = attr_valid_nsec; + timespecadd(&duration, &now, &timeout_ts); + timespec2bintime(&timeout_ts, timeout); + } +} -#define FACCESS_VA_VALID 0x01 -#define FACCESS_DO_ACCESS 0x02 -#define FACCESS_STICKY 0x04 -#define FACCESS_CHOWN 0x08 -#define FACCESS_NOCHECKSPY 0x10 -#define FACCESS_SETGID 0x12 +/* + * Make a cacheable timeout value in timespec format based on the fuse_entry_out + * response + */ +static inline void +fuse_validity_2_timespec(const struct fuse_entry_out *feo, + struct timespec *timeout) +{ + struct timespec duration, now; -#define FACCESS_XQUERIES (FACCESS_STICKY | FACCESS_CHOWN | FACCESS_SETGID) + getnanouptime(&now); + /* "+ 2" is the bound of entry_valid_nsec + now.tv_nsec */ + if (feo->entry_valid >= INT_MAX || + feo->entry_valid + now.tv_sec + 2 >= INT_MAX) { + timeout->tv_sec = INT_MAX; + } else { + duration.tv_sec = feo->entry_valid; + duration.tv_nsec = feo->entry_valid_nsec; + timespecadd(&duration, &now, timeout); + } +} -struct fuse_access_param { - uid_t xuid; - gid_t xgid; - uint32_t facc_flags; -}; +/* VFS ops */ +int +fuse_internal_get_cached_vnode(struct mount*, ino_t, int, struct vnode**); + +/* access */ static inline int fuse_match_cred(struct ucred *basecred, struct ucred *usercred) { @@ -189,8 +218,8 @@ return (EPERM); } -int fuse_internal_access(struct vnode *vp, mode_t mode, - struct fuse_access_param *facp, struct thread *td, struct ucred *cred); +int fuse_internal_access(struct vnode *vp, accmode_t mode, + struct thread *td, struct ucred *cred); /* attributes */ void fuse_internal_cache_attrs(struct vnode *vp, struct fuse_attr *attr, @@ -198,21 +227,35 @@ /* fsync */ -int fuse_internal_fsync(struct vnode *vp, struct thread *td, - struct ucred *cred, struct fuse_filehandle *fufh); +int fuse_internal_fsync(struct vnode *vp, struct thread *td, int waitfor, + bool datasync); int fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio); -/* readdir */ +/* getattr */ +int fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap, + struct ucred *cred, struct thread *td); +int fuse_internal_getattr(struct vnode *vp, struct vattr *vap, + struct ucred *cred, struct thread *td); +/* asynchronous invalidation */ +int fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio); +int fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio); + +/* mknod */ +int fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, struct vattr *vap); + +/* readdir */ struct pseudo_dirent { uint32_t d_namlen; }; +int fuse_internal_readdir(struct vnode *vp, struct uio *uio, off_t startoff, + struct fuse_filehandle *fufh, struct fuse_iov *cookediov, int *ncookies, + u_long *cookies); +int fuse_internal_readdir_processdata(struct uio *uio, off_t startoff, + int *fnd_start, size_t reqsize, void *buf, size_t bufsize, + struct fuse_iov *cookediov, int *ncookies, u_long **cookiesp); -int fuse_internal_readdir(struct vnode *vp, struct uio *uio, - struct fuse_filehandle *fufh, struct fuse_iov *cookediov); -int fuse_internal_readdir_processdata(struct uio *uio, size_t reqsize, - void *buf, size_t bufsize, void *param); - /* remove */ int fuse_internal_remove(struct vnode *dvp, struct vnode *vp, @@ -227,6 +270,10 @@ void fuse_internal_vnode_disappear(struct vnode *vp); +/* setattr */ +int fuse_internal_setattr(struct vnode *vp, struct vattr *va, + struct thread *td, 
struct ucred *cred); + /* strategy */ /* entity creation */ @@ -270,5 +317,9 @@ int fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio); void fuse_internal_send_init(struct fuse_data *data, struct thread *td); + +/* module load/unload */ +void fuse_internal_init(void); +void fuse_internal_destroy(void); #endif /* _FUSE_INTERNAL_H_ */ Index: sys/fs/fuse/fuse_internal.c =================================================================== --- sys/fs/fuse/fuse_internal.c +++ sys/fs/fuse/fuse_internal.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -59,6 +64,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include #include @@ -89,35 +95,78 @@ #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" +#include "fuse_io.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_file.h" -#include "fuse_param.h" -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , internal, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , internal, trace, "int", "char*"); #ifdef ZERO_PAD_INCOMPLETE_BUFS static int isbzero(void *buf, size_t len); #endif -/* access */ +counter_u64_t fuse_lookup_cache_hits; +counter_u64_t fuse_lookup_cache_misses; +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_hits, CTLFLAG_RD, + &fuse_lookup_cache_hits, "number of positive cache hits in lookup"); + +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_misses, CTLFLAG_RD, + &fuse_lookup_cache_misses, "number of cache misses in lookup"); + int +fuse_internal_get_cached_vnode(struct mount* mp, ino_t ino, int flags, + struct vnode **vpp) +{ + struct bintime now; + struct thread *td = curthread; + uint64_t nodeid = ino; + int error; + + *vpp = NULL; + + error = vfs_hash_get(mp, fuse_vnode_hash(nodeid), flags, td, vpp, + fuse_vnode_cmp, &nodeid); + if (error) + return error; + /* + * Check the entry cache timeout. We have to do this within fusefs + * instead of by using cache_enter_time/cache_lookup because those + * routines are only intended to work with pathnames, not inodes + */ + if (*vpp != NULL) { + getbinuptime(&now); + if (bintime_cmp(&(VTOFUD(*vpp)->entry_cache_timeout), &now, >)){ + counter_u64_add(fuse_lookup_cache_hits, 1); + return 0; + } else { + /* Entry cache timeout */ + counter_u64_add(fuse_lookup_cache_misses, 1); + cache_purge(*vpp); + vput(*vpp); + *vpp = NULL; + } + } + return 0; +} + +/* Synchronously send a FUSE_ACCESS operation */ +int fuse_internal_access(struct vnode *vp, - mode_t mode, - struct fuse_access_param *facp, + accmode_t mode, struct thread *td, struct ucred *cred) { int err = 0; - uint32_t mask = 0; + uint32_t mask = F_OK; int dataflags; int vtype; struct mount *mp; @@ -125,77 +174,57 @@ struct fuse_access_in *fai; struct fuse_data *data; - /* NOT YET DONE */ - /* - * If this vnop gives you trouble, just return 0 here for a lazy - * kludge. 
- */ - /* return 0;*/ - mp = vnode_mount(vp); vtype = vnode_vtype(vp); data = fuse_get_mpdata(mp); dataflags = data->dataflags; - if ((mode & VWRITE) && vfs_isrdonly(mp)) { - return EACCES; - } - /* Unless explicitly permitted, deny everyone except the fs owner. */ - if (vnode_isvroot(vp) && !(facp->facc_flags & FACCESS_NOCHECKSPY)) { - if (!(dataflags & FSESS_DAEMON_CAN_SPY)) { - int denied = fuse_match_cred(data->daemoncred, - cred); + if (mode == 0) + return 0; - if (denied) { - return EPERM; - } + if (mode & VMODIFY_PERMS && vfs_isrdonly(mp)) { + switch (vp->v_type) { + case VDIR: + /* FALLTHROUGH */ + case VLNK: + /* FALLTHROUGH */ + case VREG: + return EROFS; + default: + break; } - facp->facc_flags |= FACCESS_NOCHECKSPY; } - if (!(facp->facc_flags & FACCESS_DO_ACCESS)) { - return 0; + + /* Unless explicitly permitted, deny everyone except the fs owner. */ + if (!(dataflags & FSESS_DAEMON_CAN_SPY)) { + if (fuse_match_cred(data->daemoncred, cred)) + return EPERM; } - if (((vtype == VREG) && (mode & VEXEC))) { -#ifdef NEED_MOUNT_ARGUMENT_FOR_THIS - /* Let the kernel handle this through open / close heuristics.*/ - return ENOTSUP; -#else - /* Let the kernel handle this. */ - return 0; -#endif - } - if (!fsess_isimpl(mp, FUSE_ACCESS)) { - /* Let the kernel handle this. */ - return 0; - } + if (dataflags & FSESS_DEFAULT_PERMISSIONS) { - /* Let the kernel handle this. */ - return 0; + struct vattr va; + + fuse_internal_getattr(vp, &va, cred, td); + return vaccess(vp->v_type, va.va_mode, va.va_uid, + va.va_gid, mode, cred, NULL); } - if ((mode & VADMIN) != 0) { - err = priv_check_cred(cred, PRIV_VFS_ADMIN); - if (err) { - return err; - } - } - if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0) { + + if (!fsess_isimpl(mp, FUSE_ACCESS)) + return 0; + + if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0) mask |= W_OK; - } - if ((mode & VREAD) != 0) { + if ((mode & VREAD) != 0) mask |= R_OK; - } - if ((mode & VEXEC) != 0) { + if ((mode & VEXEC) != 0) mask |= X_OK; - } - bzero(&fdi, sizeof(fdi)); fdisp_init(&fdi, sizeof(*fai)); fdisp_make_vp(&fdi, FUSE_ACCESS, vp, td, cred); fai = fdi.indata; - fai->mask = F_OK; - fai->mask |= mask; + fai->mask = mask; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); @@ -208,9 +237,9 @@ } /* - * Cache FUSE attributes from feo, in attr cache associated with vnode 'vp'. - * Optionally, if argument 'vap' is not NULL, store a copy of the converted - * attributes there as well. + * Cache FUSE attributes from attr, in attribute cache associated with vnode + * 'vp'. Optionally, if argument 'vap' is not NULL, store a copy of the + * converted attributes there as well. * * If the nominal attribute cache TTL is zero, do not cache on the 'vp' (but do * return the result to the caller). @@ -221,49 +250,57 @@ { struct mount *mp; struct fuse_vnode_data *fvdat; + struct fuse_data *data; struct vattr *vp_cache_at; mp = vnode_mount(vp); fvdat = VTOFUD(vp); + data = fuse_get_mpdata(mp); - /* Honor explicit do-not-cache requests from user filesystems. 
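The rewritten fuse_internal_access() above only issues FUSE_ACCESS when the daemon implements it and default_permissions is off; otherwise the kernel either runs vaccess() itself or allows the access. On the daemon side, the mask it receives is the R_OK/W_OK/X_OK combination built here. A sketch of what a high-level libfuse server's handler might look like, assuming libfuse's access callback signature; myfs_access and its policy are purely illustrative.

    #include <errno.h>
    #include <unistd.h>

    /*
     * 'mask' is F_OK or a bitwise OR of R_OK/W_OK/X_OK, which is what
     * the kernel packs into fuse_access_in.  Returning -ENOSYS once
     * makes the kernel stop sending FUSE_ACCESS for this mount.
     */
    static int
    myfs_access(const char *path, int mask)
    {
            /* Hypothetical policy: a read-only tree rejects writes. */
            if (mask & W_OK)
                    return (-EACCES);
            (void)path;
            return (0);
    }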
*/ - if (attr_valid == 0 && attr_valid_nsec == 0) - fvdat->valid_attr_cache = false; - else - fvdat->valid_attr_cache = true; + ASSERT_VOP_ELOCKED(vp, "fuse_internal_cache_attrs"); - vp_cache_at = VTOVA(vp); + fuse_validity_2_bintime(attr_valid, attr_valid_nsec, + &fvdat->attr_cache_timeout); - if (vap == NULL && vp_cache_at == NULL) + /* Fix our buffers if the filesize changed without us knowing */ + if (vnode_isreg(vp) && attr->size != fvdat->cached_attrs.va_size) { + (void)fuse_vnode_setsize(vp, attr->size); + fvdat->cached_attrs.va_size = attr->size; + } + + if (attr_valid > 0 || attr_valid_nsec > 0) + vp_cache_at = &(fvdat->cached_attrs); + else if (vap != NULL) + vp_cache_at = vap; + else return; - if (vap == NULL) - vap = vp_cache_at; - - vattr_null(vap); - - vap->va_fsid = mp->mnt_stat.f_fsid.val[0]; - vap->va_fileid = attr->ino; - vap->va_mode = attr->mode & ~S_IFMT; - vap->va_nlink = attr->nlink; - vap->va_uid = attr->uid; - vap->va_gid = attr->gid; - vap->va_rdev = attr->rdev; - vap->va_size = attr->size; + vattr_null(vp_cache_at); + vp_cache_at->va_fsid = mp->mnt_stat.f_fsid.val[0]; + vp_cache_at->va_fileid = attr->ino; + vp_cache_at->va_mode = attr->mode & ~S_IFMT; + vp_cache_at->va_nlink = attr->nlink; + vp_cache_at->va_uid = attr->uid; + vp_cache_at->va_gid = attr->gid; + vp_cache_at->va_rdev = attr->rdev; + vp_cache_at->va_size = attr->size; /* XXX on i386, seconds are truncated to 32 bits */ - vap->va_atime.tv_sec = attr->atime; - vap->va_atime.tv_nsec = attr->atimensec; - vap->va_mtime.tv_sec = attr->mtime; - vap->va_mtime.tv_nsec = attr->mtimensec; - vap->va_ctime.tv_sec = attr->ctime; - vap->va_ctime.tv_nsec = attr->ctimensec; - vap->va_blocksize = PAGE_SIZE; - vap->va_type = IFTOVT(attr->mode); - vap->va_bytes = attr->blocks * S_BLKSIZE; - vap->va_flags = 0; + vp_cache_at->va_atime.tv_sec = attr->atime; + vp_cache_at->va_atime.tv_nsec = attr->atimensec; + vp_cache_at->va_mtime.tv_sec = attr->mtime; + vp_cache_at->va_mtime.tv_nsec = attr->mtimensec; + vp_cache_at->va_ctime.tv_sec = attr->ctime; + vp_cache_at->va_ctime.tv_nsec = attr->ctimensec; + if (fuse_libabi_geq(data, 7, 9) && attr->blksize > 0) + vp_cache_at->va_blocksize = attr->blksize; + else + vp_cache_at->va_blocksize = PAGE_SIZE; + vp_cache_at->va_type = IFTOVT(attr->mode); + vp_cache_at->va_bytes = attr->blocks * S_BLKSIZE; + vp_cache_at->va_flags = 0; - if (vap != vp_cache_at && vp_cache_at != NULL) - memcpy(vp_cache_at, vap, sizeof(*vap)); + if (vap != vp_cache_at && vap != NULL) + memcpy(vap, vp_cache_at, sizeof(*vap)); } @@ -281,47 +318,195 @@ int fuse_internal_fsync(struct vnode *vp, struct thread *td, - struct ucred *cred, - struct fuse_filehandle *fufh) + int waitfor, + bool datasync) { - int op = FUSE_FSYNC; - struct fuse_fsync_in *ffsi; + struct fuse_fsync_in *ffsi = NULL; struct fuse_dispatcher fdi; + struct fuse_filehandle *fufh; + struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct mount *mp = vnode_mount(vp); + int op = FUSE_FSYNC; + int err = 0; - if (vnode_isdir(vp)) { - op = FUSE_FSYNCDIR; + if (!fsess_isimpl(vnode_mount(vp), + (vnode_vtype(vp) == VDIR ? 
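The attr_valid/attr_valid_nsec pair cached above comes straight from the daemon's getattr reply. With the low-level libfuse API the server expresses it as a single floating-point timeout; a minimal sketch, assuming libfuse 3's fuse_reply_attr() helper, with an arbitrary one-second validity and a hypothetical myfs_getattr handler:

    #include <sys/stat.h>
    #include <string.h>

    #define FUSE_USE_VERSION 34
    #include <fuse_lowlevel.h>

    static void
    myfs_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
    {
            struct stat st;

            memset(&st, 0, sizeof(st));
            st.st_ino = ino;
            st.st_mode = S_IFREG | 0644;
            (void)fi;

            /*
             * The 1.0 here becomes attr_valid=1, attr_valid_nsec=0 on
             * the wire, which fusefs turns into a bintime expiration.
             */
            fuse_reply_attr(req, &st, 1.0);
    }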
FUSE_FSYNCDIR : FUSE_FSYNC))) {
+ return 0;
}
- fdisp_init(&fdi, sizeof(*ffsi));
- fdisp_make_vp(&fdi, op, vp, td, cred);
- ffsi = fdi.indata;
- ffsi->fh = fufh->fh_id;
+ if (vnode_isdir(vp))
+ op = FUSE_FSYNCDIR;
- ffsi->fsync_flags = 1; /* datasync */
+ if (!fsess_isimpl(mp, op))
+ return 0;
- fuse_insert_callback(fdi.tick, fuse_internal_fsync_callback);
- fuse_insert_message(fdi.tick);
+ fdisp_init(&fdi, sizeof(*ffsi));
+ /*
+ * fsync every open file handle for this file, because we can't be sure
+ * which file handle the caller is really referring to.
+ */
+ LIST_FOREACH(fufh, &fvdat->handles, next) {
+ if (ffsi == NULL)
+ fdisp_make_vp(&fdi, op, vp, td, NULL);
+ else
+ fdisp_refresh_vp(&fdi, op, vp, td, NULL);
+ ffsi = fdi.indata;
+ ffsi->fh = fufh->fh_id;
+ ffsi->fsync_flags = 0;
+ if (datasync)
+ ffsi->fsync_flags = 1;
+
+ if (waitfor == MNT_WAIT) {
+ err = fdisp_wait_answ(&fdi);
+ } else {
+ fuse_insert_callback(fdi.tick,
+ fuse_internal_fsync_callback);
+ fuse_insert_message(fdi.tick, false);
+ }
+ if (err == ENOSYS) {
+ /* ENOSYS means "success, and don't call again" */
+ fsess_set_notimpl(mp, op);
+ err = 0;
+ break;
+ }
+ }
fdisp_destroy(&fdi);
- return 0;
+ return err;
+}
+/* Asynchronous invalidation */
+SDT_PROBE_DEFINE2(fusefs, , internal, invalidate_cache_hit,
+ "struct vnode*", "struct vnode*");
+int
+fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio)
+{
+ struct fuse_notify_inval_entry_out fnieo;
+ struct componentname cn;
+ struct vnode *dvp, *vp;
+ char name[PATH_MAX];
+ int err;
+
+ if ((err = uiomove(&fnieo, sizeof(fnieo), uio)) != 0)
+ return (err);
+
+ if ((err = uiomove(name, fnieo.namelen, uio)) != 0)
+ return (err);
+ name[fnieo.namelen] = '\0';
+ /* fusefs does not cache "." or ".." entries */
+ if (strncmp(name, ".", sizeof(".")) == 0 ||
+ strncmp(name, "..", sizeof("..")) == 0)
+ return (0);
+
+ if (fnieo.parent == FUSE_ROOT_ID)
+ err = VFS_ROOT(mp, LK_SHARED, &dvp);
+ else
+ err = fuse_internal_get_cached_vnode( mp, fnieo.parent,
+ LK_SHARED, &dvp);
+ /*
+ * If dvp is not in the cache, then it must've been reclaimed. And
+ * since fuse_vnop_reclaim does a cache_purge, name's entry must've
+ * been invalidated already. So we can safely return if dvp == NULL
+ */
+ if (err != 0 || dvp == NULL)
+ return (err);
+ /*
+ * XXX we can't check dvp's generation because the FUSE invalidate
+ * entry message doesn't include it. Worst case is that we invalidate
+ * an entry that didn't need to be invalidated.
+ */
+
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = 0; /* !MAKEENTRY means free cached entry */
+ cn.cn_thread = curthread;
+ cn.cn_cred = curthread->td_ucred;
+ cn.cn_lkflags = LK_SHARED;
+ cn.cn_pnbuf = NULL;
+ cn.cn_nameptr = name;
+ cn.cn_namelen = fnieo.namelen;
+ err = cache_lookup(dvp, &vp, &cn, NULL, NULL);
+ MPASS(err == 0);
+ fuse_vnode_clear_attr_cache(dvp);
+ vput(dvp);
+ return (0);
}
+int
+fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio)
+{
+ struct fuse_notify_inval_inode_out fniio;
+ struct vnode *vp;
+ int err;
+
+ if ((err = uiomove(&fniio, sizeof(fniio), uio)) != 0)
+ return (err);
+
+ if (fniio.ino == FUSE_ROOT_ID)
+ err = VFS_ROOT(mp, LK_EXCLUSIVE, &vp);
+ else
+ err = fuse_internal_get_cached_vnode(mp, fniio.ino, LK_SHARED,
+ &vp);
+ if (err != 0 || vp == NULL)
+ return (err);
+ /*
+ * XXX we can't check vp's generation because the FUSE invalidate
+ * entry message doesn't include it. Worst case is that we invalidate
+ * an inode that didn't need to be invalidated. 
+ */ + + /* + * Flush and invalidate buffers if off >= 0. Technically we only need + * to flush and invalidate the range of offsets [off, off + len), but + * for simplicity's sake we do everything. + */ + if (fniio.off >= 0) + fuse_io_invalbuf(vp, curthread); + fuse_vnode_clear_attr_cache(vp); + vput(vp); + return (0); +} + +/* mknod */ +int +fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, struct vattr *vap) +{ + struct fuse_data *data; + struct fuse_mknod_in fmni; + size_t insize; + + data = fuse_get_mpdata(dvp->v_mount); + + fmni.mode = MAKEIMODE(vap->va_type, vap->va_mode); + fmni.rdev = vap->va_rdev; + if (fuse_libabi_geq(data, 7, 12)) { + insize = sizeof(fmni); + fmni.umask = curthread->td_proc->p_fd->fd_cmask; + } else { + insize = FUSE_COMPAT_MKNOD_IN_SIZE; + } + return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKNOD, &fmni, + insize, vap->va_type)); +} + /* readdir */ int fuse_internal_readdir(struct vnode *vp, struct uio *uio, + off_t startoff, struct fuse_filehandle *fufh, - struct fuse_iov *cookediov) + struct fuse_iov *cookediov, + int *ncookies, + u_long *cookies) { int err = 0; struct fuse_dispatcher fdi; - struct fuse_read_in *fri; + struct fuse_read_in *fri = NULL; + int fnd_start; - if (uio_resid(uio) == 0) { + if (uio_resid(uio) == 0) return 0; - } fdisp_init(&fdi, 0); /* @@ -329,51 +514,70 @@ * I/O). */ + /* + * fnd_start is set non-zero once the offset in the directory gets + * to the startoff. This is done because directories must be read + * from the beginning (offset == 0) when fuse_vnop_readdir() needs + * to do an open of the directory. + * If it is not set non-zero here, it will be set non-zero in + * fuse_internal_readdir_processdata() when uio_offset == startoff. + */ + fnd_start = 0; + if (uio->uio_offset == startoff) + fnd_start = 1; while (uio_resid(uio) > 0) { - fdi.iosize = sizeof(*fri); - fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL); + if (fri == NULL) + fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL); + else + fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL); fri = fdi.indata; fri->fh = fufh->fh_id; fri->offset = uio_offset(uio); - fri->size = min(uio_resid(uio), FUSE_DEFAULT_IOSIZE); - /* mp->max_read */ + fri->size = MIN(uio->uio_resid, + fuse_get_mpdata(vp->v_mount)->max_read); - if ((err = fdisp_wait_answ(&fdi))) { + if ((err = fdisp_wait_answ(&fdi))) break; - } - if ((err = fuse_internal_readdir_processdata(uio, fri->size, fdi.answ, - fdi.iosize, cookediov))) { + if ((err = fuse_internal_readdir_processdata(uio, startoff, + &fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov, + ncookies, &cookies))) break; - } } fdisp_destroy(&fdi); return ((err == -1) ? 0 : err); } +/* + * Return -1 to indicate that this readdir is finished, 0 if it copied + * all the directory data read in and it may be possible to read more + * and greater than 0 for a failure. 
+ */ int fuse_internal_readdir_processdata(struct uio *uio, + off_t startoff, + int *fnd_start, size_t reqsize, void *buf, size_t bufsize, - void *param) + struct fuse_iov *cookediov, + int *ncookies, + u_long **cookiesp) { int err = 0; - int cou = 0; int bytesavail; size_t freclen; struct dirent *de; struct fuse_dirent *fudge; - struct fuse_iov *cookediov = param; + u_long *cookies; - if (bufsize < FUSE_NAME_OFFSET) { + cookies = *cookiesp; + if (bufsize < FUSE_NAME_OFFSET) return -1; - } for (;;) { - if (bufsize < FUSE_NAME_OFFSET) { err = -1; break; @@ -381,10 +585,12 @@ fudge = (struct fuse_dirent *)buf; freclen = FUSE_DIRENT_SIZE(fudge); - cou++; - if (bufsize < freclen) { - err = ((cou == 1) ? -1 : 0); + /* + * This indicates a partial directory entry at the + * end of the directory data. + */ + err = -1; break; } #ifdef ZERO_PAD_INCOMPLETE_BUFS @@ -402,30 +608,47 @@ &fudge->namelen); if (bytesavail > uio_resid(uio)) { + /* Out of space for the dir so we are done. */ err = -1; break; } - fiov_refresh(cookediov); - fiov_adjust(cookediov, bytesavail); + /* + * Don't start to copy the directory entries out until + * the requested offset in the directory is found. + */ + if (*fnd_start != 0) { + fiov_adjust(cookediov, bytesavail); + bzero(cookediov->base, bytesavail); - de = (struct dirent *)cookediov->base; - de->d_fileno = fudge->ino; - de->d_reclen = bytesavail; - de->d_type = fudge->type; - de->d_namlen = fudge->namelen; - memcpy((char *)cookediov->base + sizeof(struct dirent) - - MAXNAMLEN - 1, - (char *)buf + FUSE_NAME_OFFSET, fudge->namelen); - dirent_terminate(de); + de = (struct dirent *)cookediov->base; + de->d_fileno = fudge->ino; + de->d_reclen = bytesavail; + de->d_type = fudge->type; + de->d_namlen = fudge->namelen; + memcpy((char *)cookediov->base + sizeof(struct dirent) - + MAXNAMLEN - 1, + (char *)buf + FUSE_NAME_OFFSET, fudge->namelen); + dirent_terminate(de); - err = uiomove(cookediov->base, cookediov->len, uio); - if (err) { - break; - } + err = uiomove(cookediov->base, cookediov->len, uio); + if (err) + break; + if (cookies != NULL) { + if (*ncookies == 0) { + err = -1; + break; + } + *cookies = fudge->off; + cookies++; + (*ncookies)--; + } + } else if (startoff == fudge->off) + *fnd_start = 1; buf = (char *)buf + freclen; bufsize -= freclen; uio_setoffset(uio, fudge->off); } + *cookiesp = cookies; return err; } @@ -439,12 +662,9 @@ enum fuse_opcode op) { struct fuse_dispatcher fdi; - struct fuse_vnode_data *fvdat; - int err; + nlink_t nlink; + int err = 0; - err = 0; - fvdat = VTOFUD(vp); - fdisp_init(&fdi, cnp->cn_namelen + 1); fdisp_make_vp(&fdi, op, dvp, cnp->cn_thread, cnp->cn_cred); @@ -453,6 +673,35 @@ err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); + + if (err) + return (err); + + /* + * Access the cached nlink even if the attr cached has expired. If + * it's inaccurate, the worst that will happen is: + * 1) We'll recycle the vnode even though the file has another link we + * don't know about, costing a bit of cpu time, or + * 2) We won't recycle the vnode even though all of its links are gone. + * It will linger around until vnlru reclaims it, costing a bit of + * temporary memory. + */ + nlink = VTOFUD(vp)->cached_attrs.va_nlink--; + + /* + * Purge the parent's attribute cache because the daemon + * should've updated its mtime and ctime. 
+ */ + fuse_vnode_clear_attr_cache(dvp); + + /* NB: nlink could be zero if it was never cached */ + if (nlink <= 1 || vnode_vtype(vp) == VDIR) { + fuse_internal_vnode_disappear(vp); + } else { + cache_purge(vp); + fuse_vnode_update(vp, FN_CTIMECHANGE); + } + return err; } @@ -532,6 +781,13 @@ feo->nodeid, 1); return err; } + + /* + * Purge the parent's attribute cache because the daemon should've + * updated its mtime and ctime + */ + fuse_vnode_clear_attr_cache(dvp); + fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); @@ -593,10 +849,79 @@ ffi = fdi.indata; ffi->nlookup = nlookup; - fuse_insert_message(fdi.tick); + fuse_insert_message(fdi.tick, false); fdisp_destroy(&fdi); } +/* Fetch the vnode's attributes from the daemon*/ +int +fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap, + struct ucred *cred, struct thread *td) +{ + struct fuse_dispatcher fdi; + struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_getattr_in *fgai; + struct fuse_attr_out *fao; + off_t old_filesize = fvdat->cached_attrs.va_size; + struct timespec old_ctime = fvdat->cached_attrs.va_ctime; + struct timespec old_mtime = fvdat->cached_attrs.va_mtime; + enum vtype vtyp; + int err; + + fdisp_init(&fdi, 0); + fdisp_make_vp(&fdi, FUSE_GETATTR, vp, td, cred); + fgai = fdi.indata; + /* + * We could look up a file handle and set it in fgai->fh, but that + * involves extra runtime work and I'm unaware of any file systems that + * care. + */ + fgai->getattr_flags = 0; + if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) { + if (err == ENOENT) + fuse_internal_vnode_disappear(vp); + goto out; + } + + fao = (struct fuse_attr_out *)fdi.answ; + vtyp = IFTOVT(fao->attr.mode); + if (fvdat->flag & FN_SIZECHANGE) + fao->attr.size = old_filesize; + if (fvdat->flag & FN_CTIMECHANGE) { + fao->attr.ctime = old_ctime.tv_sec; + fao->attr.ctimensec = old_ctime.tv_nsec; + } + if (fvdat->flag & FN_MTIMECHANGE) { + fao->attr.mtime = old_mtime.tv_sec; + fao->attr.mtimensec = old_mtime.tv_nsec; + } + fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, + fao->attr_valid_nsec, vap); + if (vtyp != vnode_vtype(vp)) { + fuse_internal_vnode_disappear(vp); + err = ENOENT; + } + +out: + fdisp_destroy(&fdi); + return err; +} + +/* Read a vnode's attributes from cache or fetch them from the fuse daemon */ +int +fuse_internal_getattr(struct vnode *vp, struct vattr *vap, struct ucred *cred, + struct thread *td) +{ + struct vattr *attrs; + + if ((attrs = VTOVA(vp)) != NULL) { + *vap = *attrs; /* struct copy */ + return 0; + } + + return fuse_internal_do_getattr(vp, vap, cred, td); +} + void fuse_internal_vnode_disappear(struct vnode *vp) { @@ -604,7 +929,6 @@ ASSERT_VOP_ELOCKED(vp, "fuse_internal_vnode_disappear"); fvdat->flag |= FN_REVOKED; - fvdat->valid_attr_cache = false; cache_purge(vp); } @@ -625,27 +949,69 @@ } fiio = fticket_resp(tick)->base; - /* XXX: Do we want to check anything further besides this? */ - if (fiio->major < 7) { - SDT_PROBE2(fuse, , internal, trace, 1, + data->fuse_libabi_major = fiio->major; + data->fuse_libabi_minor = fiio->minor; + if (!fuse_libabi_geq(data, 7, 4)) { + /* + * With a little work we could support servers as old as 7.1. + * But there would be little payoff. 
+ */ + SDT_PROBE2(fusefs, , internal, trace, 1, "userpace version too low"); err = EPROTONOSUPPORT; goto out; } - data->fuse_libabi_major = fiio->major; - data->fuse_libabi_minor = fiio->minor; if (fuse_libabi_geq(data, 7, 5)) { - if (fticket_resp(tick)->len == sizeof(struct fuse_init_out)) { + if (fticket_resp(tick)->len == sizeof(struct fuse_init_out) || + fticket_resp(tick)->len == FUSE_COMPAT_22_INIT_OUT_SIZE) { data->max_write = fiio->max_write; + if (fiio->flags & FUSE_ASYNC_READ) + data->dataflags |= FSESS_ASYNC_READ; + if (fiio->flags & FUSE_POSIX_LOCKS) + data->dataflags |= FSESS_POSIX_LOCKS; + if (fiio->flags & FUSE_EXPORT_SUPPORT) + data->dataflags |= FSESS_EXPORT_SUPPORT; + /* + * Don't bother to check FUSE_BIG_WRITES, because it's + * redundant with max_write + */ + /* + * max_background and congestion_threshold are not + * implemented + */ } else { err = EINVAL; } } else { - /* Old fix values */ + /* Old fixed values */ data->max_write = 4096; } + if (fuse_libabi_geq(data, 7, 6)) + data->max_readahead_blocks = fiio->max_readahead / maxbcachebuf; + + if (!fuse_libabi_geq(data, 7, 7)) + fsess_set_notimpl(data->mp, FUSE_INTERRUPT); + + if (!fuse_libabi_geq(data, 7, 8)) { + fsess_set_notimpl(data->mp, FUSE_BMAP); + fsess_set_notimpl(data->mp, FUSE_DESTROY); + } + + if (fuse_libabi_geq(data, 7, 23) && fiio->time_gran >= 1 && + fiio->time_gran <= 1000000000) + data->time_gran = fiio->time_gran; + else + data->time_gran = 1; + + if (!fuse_libabi_geq(data, 7, 23)) + data->cache_mode = fuse_data_cache_mode; + else if (fiio->flags & FUSE_WRITEBACK_CACHE) + data->cache_mode = FUSE_CACHE_WB; + else + data->cache_mode = FUSE_CACHE_WT; + out: if (err) { fdata_set_dead(data); @@ -669,14 +1035,156 @@ fiii = fdi.indata; fiii->major = FUSE_KERNEL_VERSION; fiii->minor = FUSE_KERNEL_MINOR_VERSION; - fiii->max_readahead = FUSE_DEFAULT_IOSIZE * 16; - fiii->flags = 0; + /* + * fusefs currently reads ahead no more than one cache block at a time. + * See fuse_read_biobackend + */ + fiii->max_readahead = maxbcachebuf; + /* + * Unsupported features: + * FUSE_FILE_OPS: No known FUSE server or client supports it + * FUSE_ATOMIC_O_TRUNC: our VFS cannot support it + * FUSE_DONT_MASK: unlike Linux, FreeBSD always applies the umask, even + * when default ACLs are in use. + * FUSE_SPLICE_WRITE, FUSE_SPLICE_MOVE, FUSE_SPLICE_READ: FreeBSD + * doesn't have splice(2). + * FUSE_FLOCK_LOCKS: not yet implemented + * FUSE_HAS_IOCTL_DIR: not yet implemented + * FUSE_AUTO_INVAL_DATA: not yet implemented + * FUSE_DO_READDIRPLUS: not yet implemented + * FUSE_READDIRPLUS_AUTO: not yet implemented + * FUSE_ASYNC_DIO: not yet implemented + * FUSE_NO_OPEN_SUPPORT: not yet implemented + */ + fiii->flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_EXPORT_SUPPORT + | FUSE_BIG_WRITES | FUSE_WRITEBACK_CACHE; fuse_insert_callback(fdi.tick, fuse_internal_init_callback); - fuse_insert_message(fdi.tick); + fuse_insert_message(fdi.tick, false); fdisp_destroy(&fdi); } +/* + * Send a FUSE_SETATTR operation with no permissions checks. 
If cred is NULL, + * send the request with root credentials + */ +int fuse_internal_setattr(struct vnode *vp, struct vattr *vap, + struct thread *td, struct ucred *cred) +{ + struct fuse_vnode_data *fvdat; + struct fuse_dispatcher fdi; + struct fuse_setattr_in *fsai; + struct mount *mp; + pid_t pid = td->td_proc->p_pid; + struct fuse_data *data; + int dataflags; + int err = 0; + enum vtype vtyp; + int sizechanged = -1; + uint64_t newsize = 0; + + mp = vnode_mount(vp); + fvdat = VTOFUD(vp); + data = fuse_get_mpdata(mp); + dataflags = data->dataflags; + + fdisp_init(&fdi, sizeof(*fsai)); + fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); + if (!cred) { + fdi.finh->uid = 0; + fdi.finh->gid = 0; + } + fsai = fdi.indata; + fsai->valid = 0; + + if (vap->va_uid != (uid_t)VNOVAL) { + fsai->uid = vap->va_uid; + fsai->valid |= FATTR_UID; + } + if (vap->va_gid != (gid_t)VNOVAL) { + fsai->gid = vap->va_gid; + fsai->valid |= FATTR_GID; + } + if (vap->va_size != VNOVAL) { + struct fuse_filehandle *fufh = NULL; + + /*Truncate to a new value. */ + fsai->size = vap->va_size; + sizechanged = 1; + newsize = vap->va_size; + fsai->valid |= FATTR_SIZE; + + fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); + if (fufh) { + fsai->fh = fufh->fh_id; + fsai->valid |= FATTR_FH; + } + VTOFUD(vp)->flag &= ~FN_SIZECHANGE; + } + if (vap->va_atime.tv_sec != VNOVAL) { + fsai->atime = vap->va_atime.tv_sec; + fsai->atimensec = vap->va_atime.tv_nsec; + fsai->valid |= FATTR_ATIME; + if (vap->va_vaflags & VA_UTIMES_NULL) + fsai->valid |= FATTR_ATIME_NOW; + } + if (vap->va_mtime.tv_sec != VNOVAL) { + fsai->mtime = vap->va_mtime.tv_sec; + fsai->mtimensec = vap->va_mtime.tv_nsec; + fsai->valid |= FATTR_MTIME; + if (vap->va_vaflags & VA_UTIMES_NULL) + fsai->valid |= FATTR_MTIME_NOW; + } else if (fvdat->flag & FN_MTIMECHANGE) { + fsai->mtime = fvdat->cached_attrs.va_mtime.tv_sec; + fsai->mtimensec = fvdat->cached_attrs.va_mtime.tv_nsec; + fsai->valid |= FATTR_MTIME; + } + if (fuse_libabi_geq(data, 7, 23) && fvdat->flag & FN_CTIMECHANGE) { + fsai->ctime = fvdat->cached_attrs.va_ctime.tv_sec; + fsai->ctimensec = fvdat->cached_attrs.va_ctime.tv_nsec; + fsai->valid |= FATTR_CTIME; + } + if (vap->va_mode != (mode_t)VNOVAL) { + fsai->mode = vap->va_mode & ALLPERMS; + fsai->valid |= FATTR_MODE; + } + if (!fsai->valid) { + goto out; + } + + if ((err = fdisp_wait_answ(&fdi))) + goto out; + vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode); + + if (vnode_vtype(vp) != vtyp) { + if (vnode_vtype(vp) == VNON && vtyp != VNON) { + SDT_PROBE2(fusefs, , internal, trace, 1, "FUSE: Dang! " + "vnode_vtype is VNON and vtype isn't."); + } else { + /* + * STALE vnode, ditch + * + * The vnode has changed its type "behind our back". + * There's nothing really we can do, so let us just + * force an internal revocation and tell the caller to + * try again, if interested. 
+ */ + fuse_internal_vnode_disappear(vp); + err = EAGAIN; + } + } + if (err == 0) { + struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ; + fuse_vnode_undirty_cached_timestamps(vp); + fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, + fao->attr_valid_nsec, NULL); + } + +out: + fdisp_destroy(&fdi); + return err; +} + #ifdef ZERO_PAD_INCOMPLETE_BUFS static int isbzero(void *buf, size_t len) @@ -692,3 +1200,19 @@ } #endif + +void +fuse_internal_init(void) +{ + fuse_lookup_cache_misses = counter_u64_alloc(M_WAITOK); + counter_u64_zero(fuse_lookup_cache_misses); + fuse_lookup_cache_hits = counter_u64_alloc(M_WAITOK); + counter_u64_zero(fuse_lookup_cache_hits); +} + +void +fuse_internal_destroy(void) +{ + counter_u64_free(fuse_lookup_cache_hits); + counter_u64_free(fuse_lookup_cache_misses); +} Index: sys/fs/fuse/fuse_io.h =================================================================== --- sys/fs/fuse/fuse_io.h +++ sys/fs/fuse/fuse_io.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -61,7 +66,7 @@ #define _FUSE_IO_H_ int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, - struct ucred *cred); + struct ucred *cred, pid_t pid); int fuse_io_strategy(struct vnode *vp, struct buf *bp); int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td); int fuse_io_invalbuf(struct vnode *vp, struct thread *td); Index: sys/fs/fuse/fuse_io.c =================================================================== --- sys/fs/fuse/fuse_io.c +++ sys/fs/fuse/fuse_io.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -72,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +89,7 @@ #include #include #include +#include #include #include @@ -98,45 +105,108 @@ #include "fuse_ipc.h" #include "fuse_io.h" -SDT_PROVIDER_DECLARE(fuse); /* + * Set in a struct buf to indicate that the write came from the buffer cache + * and the originating cred and pid are no longer known. + */ +#define B_FUSEFS_WRITE_CACHE B_FS_FLAG1 + +SDT_PROVIDER_DECLARE(fusefs); +/* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , io, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*"); +static void +fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred, + struct thread *td); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int -fuse_read_biobackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh); +fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, + struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid); static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); + struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, + int ioflag, bool pages); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); + struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid); -SDT_PROBE_DEFINE5(fuse, , io, io_dispatch, "struct vnode*", "struct uio*", +/* + * FreeBSD clears the SUID and SGID bits on any write by a non-root user. + */ +static void +fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred, + struct thread *td) +{ + struct fuse_data *data; + struct mount *mp; + struct vattr va; + int dataflags; + + mp = vnode_mount(vp); + data = fuse_get_mpdata(mp); + dataflags = data->dataflags; + + if (dataflags & FSESS_DEFAULT_PERMISSIONS) { + if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { + fuse_internal_getattr(vp, &va, cred, td); + if (va.va_mode & (S_ISUID | S_ISGID)) { + mode_t mode = va.va_mode & ~(S_ISUID | S_ISGID); + /* Clear all vattr fields except mode */ + vattr_null(&va); + va.va_mode = mode; + + /* + * Ignore fuse_internal_setattr's return value, + * because at this point the write operation has + * already succeeded and we don't want to return + * failing status for that. + */ + (void)fuse_internal_setattr(vp, &va, td, NULL); + } + } + } +} + +SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch, "struct vnode*", "struct uio*", "int", "struct ucred*", "struct fuse_filehandle*"); +SDT_PROBE_DEFINE4(fusefs, , io, io_dispatch_filehandles_closed, "struct vnode*", + "struct uio*", "int", "struct ucred*"); int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, - struct ucred *cred) + struct ucred *cred, pid_t pid) { struct fuse_filehandle *fufh; int err, directio; + int fflag; + bool closefufh = false; MPASS(vp->v_type == VREG || vp->v_type == VDIR); - err = fuse_filehandle_getrw(vp, - (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); - if (err) { + fflag = (uio->uio_rw == UIO_READ) ? FREAD : FWRITE; + err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); + if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { + /* + * nfsd will do I/O without first doing VOP_OPEN. 
We + * must implicitly open the file here + */ + err = fuse_filehandle_open(vp, fflag, &fufh, curthread, cred); + closefufh = true; + } + else if (err) { + SDT_PROBE4(fusefs, , io, io_dispatch_filehandles_closed, + vp, uio, ioflag, cred); printf("FUSE: io dispatch: filehandles are closed\n"); return err; } - SDT_PROBE5(fuse, , io, io_dispatch, vp, uio, ioflag, cred, fufh); + if (err) + goto out; + SDT_PROBE5(fusefs, , io, io_dispatch, vp, uio, ioflag, cred, fufh); /* * Ideally, when the daemon asks for direct io at open time, the @@ -153,108 +223,136 @@ switch (uio->uio_rw) { case UIO_READ: if (directio) { - SDT_PROBE2(fuse, , io, trace, 1, + SDT_PROBE2(fusefs, , io, trace, 1, "direct read of vnode"); err = fuse_read_directbackend(vp, uio, cred, fufh); } else { - SDT_PROBE2(fuse, , io, trace, 1, + SDT_PROBE2(fusefs, , io, trace, 1, "buffered read of vnode"); - err = fuse_read_biobackend(vp, uio, cred, fufh); + err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh, + pid); } break; case UIO_WRITE: - /* - * Kludge: simulate write-through caching via write-around - * caching. Same effect, as far as never caching dirty data, - * but slightly pessimal in that newly written data is not - * cached. - */ - if (directio || fuse_data_cache_mode == FUSE_CACHE_WT) { - SDT_PROBE2(fuse, , io, trace, 1, + fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); + if (directio) { + const int iosize = fuse_iosize(vp); + off_t start, end, filesize; + + SDT_PROBE2(fusefs, , io, trace, 1, "direct write of vnode"); - err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag); + + err = fuse_vnode_size(vp, &filesize, cred, curthread); + if (err) + goto out; + + start = uio->uio_offset; + end = start + uio->uio_resid; + KASSERT((ioflag & (IO_VMIO | IO_DIRECT)) != + (IO_VMIO | IO_DIRECT), + ("IO_DIRECT used for a cache flush?")); + /* Invalidate the write cache when writing directly */ + v_inval_buf_range(vp, start, end, iosize); + err = fuse_write_directbackend(vp, uio, cred, fufh, + filesize, ioflag, false); } else { - SDT_PROBE2(fuse, , io, trace, 1, + SDT_PROBE2(fusefs, , io, trace, 1, "buffered write of vnode"); - err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag); + if (!fsess_opt_writeback(vnode_mount(vp))) + ioflag |= IO_SYNC; + err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, + pid); } + fuse_io_clear_suid_on_write(vp, cred, uio->uio_td); break; default: panic("uninterpreted mode passed to fuse_io_dispatch"); } +out: + if (closefufh) + fuse_filehandle_close(vp, fufh, curthread, cred); + return (err); } -SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_start, "int", "int", "int"); -SDT_PROBE_DEFINE2(fuse, , io, read_bio_backend_feed, "int", "int"); -SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_end, "int", "ssize_t", "int"); +SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int"); +SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "struct buf*"); +SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int", + "struct buf*"); static int -fuse_read_biobackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh) +fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, + struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid) { struct buf *bp; - daddr_t lbn; - int bcount; - int err = 0, n = 0, on = 0; + struct mount *mp; + struct fuse_data *data; + daddr_t lbn, nextlbn; + int bcount, nextsize; + int err, n = 0, on = 0, seqcount; off_t filesize; const int biosize = fuse_iosize(vp); + mp = 
vnode_mount(vp); + data = fuse_get_mpdata(mp); - if (uio->uio_resid == 0) - return (0); if (uio->uio_offset < 0) return (EINVAL); - bcount = biosize; - filesize = VTOFUD(vp)->filesize; + seqcount = ioflag >> IO_SEQSHIFT; - do { + err = fuse_vnode_size(vp, &filesize, cred, curthread); + if (err) + return err; + + for (err = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } + if (filesize - uio->uio_offset <= 0) + break; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); - SDT_PROBE3(fuse, , io, read_bio_backend_start, - biosize, (int)lbn, on); - - /* - * Obtain the buffer cache block. Figure out the buffer size - * when we are at EOF. If we are modifying the size of the - * buffer based on an EOF condition we need to hold - * nfs_rslock() through obtaining the buffer to prevent - * a potential writer-appender from messing with n_size. - * Otherwise we may accidentally truncate the buffer and - * lose dirty data. - * - * Note that bcount is *not* DEV_BSIZE aligned. - */ if ((off_t)lbn * biosize >= filesize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { bcount = filesize - (off_t)lbn *biosize; + } else { + bcount = biosize; } - bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); + nextlbn = lbn + 1; + nextsize = MIN(biosize, filesize - nextlbn * biosize); - if (!bp) - return (EINTR); + SDT_PROBE4(fusefs, , io, read_bio_backend_start, + biosize, (int)lbn, on, bcount); - /* - * If B_CACHE is not set, we must issue the read. If this - * fails, we return an error. - */ + if (bcount < biosize) { + /* If near EOF, don't do readahead */ + err = bread(vp, lbn, bcount, NOCRED, &bp); + } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { + /* Try clustered read */ + long totread = uio->uio_resid + on; + seqcount = MIN(seqcount, + data->max_readahead_blocks + 1); + err = cluster_read(vp, filesize, lbn, bcount, NOCRED, + totread, seqcount, 0, &bp); + } else if (seqcount > 1 && data->max_readahead_blocks >= 1) { + /* Try non-clustered readahead */ + err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1, + NOCRED, &bp); + } else { + /* Just read what was requested */ + err = bread(vp, lbn, bcount, NOCRED, &bp); + } - if ((bp->b_flags & B_CACHE) == 0) { - bp->b_iocmd = BIO_READ; - vfs_busy_pages(bp, 0); - err = fuse_io_strategy(vp, bp); - if (err) { - brelse(bp); - return (err); - } + if (err) { + brelse(bp); + bp = NULL; + break; } + /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. 
Note that bcount is @@ -264,33 +362,41 @@ */ n = 0; - if (on < bcount) - n = MIN((unsigned)(bcount - on), uio->uio_resid); + if (on < bcount - bp->b_resid) + n = MIN((unsigned)(bcount - bp->b_resid - on), + uio->uio_resid); if (n > 0) { - SDT_PROBE2(fuse, , io, read_bio_backend_feed, - n, n + (int)bp->b_resid); + SDT_PROBE2(fusefs, , io, read_bio_backend_feed, n, bp); err = uiomove(bp->b_data + on, n, uio); } - brelse(bp); - SDT_PROBE3(fuse, , io, read_bio_backend_end, err, - uio->uio_resid, n); - } while (err == 0 && uio->uio_resid > 0 && n > 0); + vfs_bio_brelse(bp, ioflag); + SDT_PROBE4(fusefs, , io, read_bio_backend_end, err, + uio->uio_resid, n, bp); + if (bp->b_resid > 0) { + /* Short read indicates EOF */ + break; + } + } return (err); } -SDT_PROBE_DEFINE1(fuse, , io, read_directbackend_start, "struct fuse_read_in*"); -SDT_PROBE_DEFINE2(fuse, , io, read_directbackend_complete, - "struct fuse_dispatcher*", "struct uio*"); +SDT_PROBE_DEFINE1(fusefs, , io, read_directbackend_start, + "struct fuse_read_in*"); +SDT_PROBE_DEFINE3(fusefs, , io, read_directbackend_complete, + "struct fuse_dispatcher*", "struct fuse_read_in*", "struct uio*"); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { + struct fuse_data *data; struct fuse_dispatcher fdi; struct fuse_read_in *fri; int err = 0; + data = fuse_get_mpdata(vp->v_mount); + if (uio->uio_resid == 0) return (0); @@ -312,19 +418,29 @@ fri->offset = uio->uio_offset; fri->size = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_read); + if (fuse_libabi_geq(data, 7, 9)) { + /* See comment regarding FUSE_WRITE_LOCKOWNER */ + fri->read_flags = 0; + fri->flags = fufh_type_2_fflags(fufh->fufh_type); + } - SDT_PROBE1(fuse, , io, read_directbackend_start, fri); + SDT_PROBE1(fusefs, , io, read_directbackend_start, fri); if ((err = fdisp_wait_answ(&fdi))) goto out; - SDT_PROBE2(fuse, , io, read_directbackend_complete, - fdi.iosize, uio); + SDT_PROBE3(fusefs, , io, read_directbackend_complete, + &fdi, fri, uio); if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) break; - if (fdi.iosize < fri->size) + if (fdi.iosize < fri->size) { + /* + * Short read. Should only happen at EOF or with + * direct io. + */ break; + } } out: @@ -334,25 +450,57 @@ static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) + struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, + int ioflag, bool pages) { struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_data *data; struct fuse_write_in *fwi; + struct fuse_write_out *fwo; struct fuse_dispatcher fdi; size_t chunksize; + void *fwi_data; + off_t as_written_offset; int diff; int err = 0; + bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO; + bool wrote_anything = false; + uint32_t write_flags; + data = fuse_get_mpdata(vp->v_mount); + + /* + * Don't set FUSE_WRITE_LOCKOWNER in write_flags. It can't be set + * accurately when using POSIX AIO, libfuse doesn't use it, and I'm not + * aware of any file systems that do. It was an attempt to add + * Linux-style mandatory locking to the FUSE protocol, but mandatory + * locking is deprecated even on Linux. See Linux commit + * f33321141b273d60cbb3a8f56a5489baad82ba5e . + */ + /* + * Set FUSE_WRITE_CACHE whenever we don't know the uid, gid, and/or pid + * that originated a write. For example when writing from the + * writeback cache. 
I don't know of a single file system that cares, + * but the protocol says we're supposed to do this. + */ + write_flags = !pages && ( + (ioflag & IO_DIRECT) || + !fsess_opt_datacache(vnode_mount(vp)) || + !fsess_opt_writeback(vnode_mount(vp))) ? 0 : FUSE_WRITE_CACHE; + if (uio->uio_resid == 0) return (0); + if (ioflag & IO_APPEND) - uio_setoffset(uio, fvdat->filesize); + uio_setoffset(uio, filesize); + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) + return (EFBIG); + fdisp_init(&fdi, 0); while (uio->uio_resid > 0) { - chunksize = MIN(uio->uio_resid, - fuse_get_mpdata(vp->v_mount)->max_write); + chunksize = MIN(uio->uio_resid, data->max_write); fdi.iosize = sizeof(*fwi) + chunksize; fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); @@ -361,79 +509,140 @@ fwi->fh = fufh->fh_id; fwi->offset = uio->uio_offset; fwi->size = chunksize; + fwi->write_flags = write_flags; + if (fuse_libabi_geq(data, 7, 9)) { + fwi->flags = fufh_type_2_fflags(fufh->fufh_type); + fwi_data = (char *)fdi.indata + sizeof(*fwi); + } else { + fwi_data = (char *)fdi.indata + + FUSE_COMPAT_WRITE_IN_SIZE; + } - if ((err = uiomove((char *)fdi.indata + sizeof(*fwi), - chunksize, uio))) + if ((err = uiomove(fwi_data, chunksize, uio))) break; - if ((err = fdisp_wait_answ(&fdi))) +retry: + err = fdisp_wait_answ(&fdi); + if (err == ERESTART || err == EINTR || err == EWOULDBLOCK) { + /* + * Rewind the uio so dofilewrite will know it's + * incomplete + */ + uio->uio_resid += fwi->size; + uio->uio_offset -= fwi->size; + /* + * Change ERESTART into EINTR because we can't rewind + * uio->uio_iov. Basically, once uiomove(9) has been + * called, it's impossible to restart a syscall. + */ + if (err == ERESTART) + err = EINTR; break; + } else if (err) { + break; + } else { + wrote_anything = true; + } + fwo = ((struct fuse_write_out *)fdi.answ); + /* Adjust the uio in the case of short writes */ - diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size; + diff = fwi->size - fwo->size; + as_written_offset = uio->uio_offset - diff; + + if (as_written_offset - diff > filesize) + fuse_vnode_setsize(vp, as_written_offset); + if (as_written_offset - diff >= filesize) + fvdat->flag &= ~FN_SIZECHANGE; + if (diff < 0) { + printf("WARNING: misbehaving FUSE filesystem " + "wrote more data than we provided it\n"); err = EINVAL; break; - } else if (diff > 0 && !(ioflag & IO_DIRECT)) { - /* - * XXX We really should be directly checking whether - * the file was opened with FOPEN_DIRECT_IO, not - * IO_DIRECT. IO_DIRECT can be set in multiple ways. 
- */ - SDT_PROBE2(fuse, , io, trace, 1, - "misbehaving filesystem: short writes are only " - "allowed with direct_io"); + } else if (diff > 0) { + /* Short write */ + if (!direct_io) { + printf("WARNING: misbehaving FUSE filesystem: " + "short writes are only allowed with " + "direct_io\n"); + } + if (ioflag & IO_DIRECT) { + /* Return early */ + uio->uio_resid += diff; + uio->uio_offset -= diff; + break; + } else { + /* Resend the unwritten portion of data */ + fdi.iosize = sizeof(*fwi) + diff; + /* Refresh fdi without clearing data buffer */ + fdisp_refresh_vp(&fdi, FUSE_WRITE, vp, + uio->uio_td, cred); + fwi = fdi.indata; + MPASS2(fwi == fdi.indata, "FUSE dispatcher " + "reallocated despite no increase in " + "size?"); + void *src = (char*)fwi_data + fwo->size; + memmove(fwi_data, src, diff); + fwi->fh = fufh->fh_id; + fwi->offset = as_written_offset; + fwi->size = diff; + fwi->write_flags = write_flags; + goto retry; + } } - uio->uio_resid += diff; - uio->uio_offset -= diff; - - if (uio->uio_offset > fvdat->filesize && - fuse_data_cache_mode != FUSE_CACHE_UC) { - fuse_vnode_setsize(vp, uio->uio_offset); - fvdat->flag &= ~FN_SIZECHANGE; - } } fdisp_destroy(&fdi); + if (wrote_anything) + fuse_vnode_undirty_cached_timestamps(vp); + return (err); } -SDT_PROBE_DEFINE6(fuse, , io, write_biobackend_start, "int64_t", "int", "int", +SDT_PROBE_DEFINE6(fusefs, , io, write_biobackend_start, "int64_t", "int", "int", "struct uio*", "int", "bool"); -SDT_PROBE_DEFINE2(fuse, , io, write_biobackend_append_race, "long", "int"); +SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_append_race, "long", "int"); +SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_issue, "int", "struct buf*"); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) + struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct buf *bp; daddr_t lbn; + off_t filesize; int bcount; - int n, on, err = 0; + int n, on, seqcount, err = 0; + bool last_page; const int biosize = fuse_iosize(vp); - KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); + seqcount = ioflag >> IO_SEQSHIFT; + + KASSERT(uio->uio_rw == UIO_WRITE, ("fuse_write_biobackend mode")); if (vp->v_type != VREG) return (EIO); if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); + + err = fuse_vnode_size(vp, &filesize, cred, curthread); + if (err) + return err; + if (ioflag & IO_APPEND) - uio_setoffset(uio, fvdat->filesize); + uio_setoffset(uio, filesize); - /* - * Find all of this file's B_NEEDCOMMIT buffers. If our writes - * would exceed the local maximum per-file write commit size when - * combined with those, we must decide whether to flush, - * go synchronous, or return err. We don't bother checking - * IO_UNIT -- we just make all writes atomic anyway, as there's - * no point optimizing for something that really won't ever happen. - */ + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) + return (EFBIG); + do { + bool direct_append, extending; + if (fuse_isdeadfs(vp)) { err = ENXIO; break; @@ -443,66 +652,60 @@ n = MIN((unsigned)(biosize - on), uio->uio_resid); again: - /* - * Handle direct append and file extension cases, calculate - * unaligned buffer size. - */ - if (uio->uio_offset == fvdat->filesize && n) { - /* - * Get the buffer (in its pre-append state to maintain - * B_CACHE if it was previously set). Resize the - * nfsnode after we have locked the buffer to prevent - * readers from reading garbage. 
- */ - bcount = on; - SDT_PROBE6(fuse, , io, write_biobackend_start, - lbn, on, n, uio, bcount, true); - bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); - + /* Get or create a buffer for the write */ + direct_append = uio->uio_offset == filesize && n; + if (uio->uio_offset + n < filesize) { + extending = false; + if ((off_t)(lbn + 1) * biosize < filesize) { + /* Not the file's last block */ + bcount = biosize; + } else { + /* The file's last block */ + bcount = filesize - (off_t)lbn * biosize; + } + } else { + extending = true; + bcount = on + n; + } + if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >= + howmany(filesize, PAGE_SIZE)) + last_page = true; + else + last_page = false; + if (direct_append) { + /* + * Take care to preserve the buffer's B_CACHE state so + * as not to cause an unnecessary read. + */ + bp = getblk(vp, lbn, on, PCATCH, 0, 0); if (bp != NULL) { - long save; - - err = fuse_vnode_setsize(vp, - uio->uio_offset + n); - if (err) { - brelse(bp); - break; - } - save = bp->b_flags & B_CACHE; - bcount += n; + uint32_t save = bp->b_flags & B_CACHE; allocbuf(bp, bcount); bp->b_flags |= save; } } else { - /* - * Obtain the locked cache block first, and then - * adjust the file's size as appropriate. - */ - bcount = on + n; - if ((off_t)lbn * biosize + bcount < fvdat->filesize) { - if ((off_t)(lbn + 1) * biosize < fvdat->filesize) - bcount = biosize; - else - bcount = fvdat->filesize - - (off_t)lbn *biosize; - } - SDT_PROBE6(fuse, , io, write_biobackend_start, - lbn, on, n, uio, bcount, false); bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); - if (bp && uio->uio_offset + n > fvdat->filesize) { - err = fuse_vnode_setsize(vp, - uio->uio_offset + n); - if (err) { - brelse(bp); - break; - } - } } - if (!bp) { err = EINTR; break; } + if (extending) { + /* + * Extend file _after_ locking buffer so we won't race + * with other readers + */ + err = fuse_vnode_setsize(vp, uio->uio_offset + n); + filesize = uio->uio_offset + n; + fvdat->flag |= FN_SIZECHANGE; + if (err) { + brelse(bp); + break; + } + } + + SDT_PROBE6(fusefs, , io, write_biobackend_start, + lbn, on, n, uio, bcount, direct_append); /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write @@ -535,6 +738,21 @@ brelse(bp); break; } + if (bp->b_resid > 0) { + /* + * Short read indicates EOF. Update file size + * from the server and try again. + */ + SDT_PROBE2(fusefs, , io, trace, 1, + "Short read during a RMW"); + brelse(bp); + err = fuse_vnode_size(vp, &filesize, cred, + curthread); + if (err) + break; + else + goto again; + } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); @@ -547,9 +765,8 @@ * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ - if (bp->b_dirtyend > bcount) { - SDT_PROBE2(fuse, , io, write_biobackend_append_race, + SDT_PROBE2(fusefs, , io, write_biobackend_append_race, (long)bp->b_blkno * biosize, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; @@ -582,6 +799,7 @@ * reasons: the only way to know if a write is valid * if its actually written out.) */ + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 0, bp); bwrite(bp); if (bp->b_error == EINTR) { err = EINTR; @@ -591,19 +809,12 @@ } err = uiomove((char *)bp->b_data + on, n, uio); - /* - * Since this block is being modified, it must be written - * again and not just committed. Since write clustering does - * not work for the stage 1 data write, only the stage 2 - * commit rpc, we have to clear B_CLUSTEROK as well. 
- */ - bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); - if (err) { bp->b_ioflags |= BIO_ERROR; bp->b_error = err; brelse(bp); break; + /* TODO: vfs_bio_clrbuf like ffs_write does? */ } /* * Only update dirtyoff/dirtyend if not a degenerate @@ -619,42 +830,85 @@ } vfs_bio_set_valid(bp, on, n); } - err = bwrite(bp); + + vfs_bio_set_flags(bp, ioflag); + + bp->b_flags |= B_FUSEFS_WRITE_CACHE; + if (ioflag & IO_SYNC) { + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 2, bp); + if (!(ioflag & IO_VMIO)) + bp->b_flags &= ~B_FUSEFS_WRITE_CACHE; + err = bwrite(bp); + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + (ioflag & IO_ASYNC)) { + bp->b_flags |= B_CLUSTEROK; + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 3, bp); + bawrite(bp); + } else if (on == 0 && n == bcount) { + if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { + bp->b_flags |= B_CLUSTEROK; + SDT_PROBE2(fusefs, , io, write_biobackend_issue, + 4, bp); + cluster_write(vp, bp, filesize, seqcount, 0); + } else { + SDT_PROBE2(fusefs, , io, write_biobackend_issue, + 5, bp); + bawrite(bp); + } + } else if (ioflag & IO_DIRECT) { + bp->b_flags |= B_CLUSTEROK; + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 6, bp); + bawrite(bp); + } else { + bp->b_flags &= ~B_CLUSTEROK; + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 7, bp); + bdwrite(bp); + } if (err) break; } while (uio->uio_resid > 0 && n > 0); - if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) - fuse_vnode_savesize(vp, cred); - return (err); } int fuse_io_strategy(struct vnode *vp, struct buf *bp) { - struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_filehandle *fufh; struct ucred *cred; struct uio *uiop; struct uio uio; struct iovec io; + off_t filesize; int error = 0; + int fflag; + /* We don't know the true pid when we're dealing with the cache */ + pid_t pid = 0; const int biosize = fuse_iosize(vp); MPASS(vp->v_type == VREG || vp->v_type == VDIR); MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); - error = fuse_filehandle_getrw(vp, - (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); + fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE; + cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; + error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); + if (bp->b_iocmd == BIO_READ && error == EBADF) { + /* + * This may be a read-modify-write operation on a cached file + * opened O_WRONLY. The FUSE protocol allows this. + */ + error = fuse_filehandle_get(vp, FWRITE, &fufh, cred, pid); + } if (error) { printf("FUSE: strategy: filehandles are closed\n"); bp->b_ioflags |= BIO_ERROR; bp->b_error = error; + bufdone(bp); return (error); } - cred = bp->b_iocmd == BIO_READ ? 
bp->b_rcred : bp->b_wcred; uiop = &uio; uiop->uio_iov = &io; @@ -673,40 +927,57 @@ KASSERT(!(bp->b_flags & B_DONE), ("fuse_io_strategy: bp %p already marked done", bp)); if (bp->b_iocmd == BIO_READ) { + ssize_t left; + io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; - uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; + uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize; error = fuse_read_directbackend(vp, uiop, cred, fufh); + /* + * Store the amount we failed to read in the buffer's private + * field, so callers can truncate the file if necessary' + */ - /* XXXCEM: Potentially invalid access to cached_attrs here */ - if ((!error && uiop->uio_resid) || - (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && - uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && - uiop->uio_offset >= fvdat->cached_attrs.va_size)) { - /* - * If we had a short read with no error, we must have - * hit a file hole. We should zero-fill the remainder. - * This can also occur if the server hits the file EOF. - * - * Holes used to be able to occur due to pending - * writes, but that is not possible any longer. - */ + if (!error && uiop->uio_resid) { int nread = bp->b_bcount - uiop->uio_resid; - int left = uiop->uio_resid; + left = uiop->uio_resid; + bzero((char *)bp->b_data + nread, left); - if (error != 0) { - printf("FUSE: Fix broken io: offset %ju, " - " resid %zd, file size %ju/%ju\n", - (uintmax_t)uiop->uio_offset, - uiop->uio_resid, fvdat->filesize, - fvdat->cached_attrs.va_size); - error = 0; + if ((fvdat->flag & FN_SIZECHANGE) == 0) { + /* + * A short read with no error, when not using + * direct io, and when no writes are cached, + * indicates EOF caused by a server-side + * truncation. Clear the attr cache so we'll + * pick up the new file size and timestamps. + * + * We must still bzero the remaining buffer so + * uninitialized data doesn't get exposed by a + * future truncate that extends the file. + * + * To prevent lock order problems, we must + * truncate the file upstack, not here. + */ + SDT_PROBE2(fusefs, , io, trace, 1, + "Short read of a clean file"); + fuse_vnode_clear_attr_cache(vp); + } else { + /* + * If dirty writes _are_ cached beyond EOF, + * that indicates a newly created hole that the + * server doesn't know about. Those don't pose + * any problem. + * XXX: we don't currently track whether dirty + * writes are cached beyond EOF, before EOF, or + * both. 
+ */ + SDT_PROBE2(fusefs, , io, trace, 1, + "Short read of a dirty file"); + uiop->uio_resid = 0; } - if (left > 0) - bzero((char *)bp->b_data + nread, left); - uiop->uio_resid = 0; + } if (error) { bp->b_ioflags |= BIO_ERROR; @@ -714,33 +985,33 @@ } } else { /* - * If we only need to commit, try to commit - */ - if (bp->b_flags & B_NEEDCOMMIT) { - SDT_PROBE2(fuse, , io, trace, 1, - "write: B_NEEDCOMMIT flags set"); - } - /* * Setup for actual write */ - if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > - fvdat->filesize) - bp->b_dirtyend = fvdat->filesize - - (off_t)bp->b_blkno * biosize; + error = fuse_vnode_size(vp, &filesize, cred, curthread); + if (error) { + bp->b_ioflags |= BIO_ERROR; + bp->b_error = error; + bufdone(bp); + return (error); + } + if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize) + bp->b_dirtyend = filesize - + (off_t)bp->b_lblkno * biosize; + if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; - uiop->uio_offset = (off_t)bp->b_blkno * biosize + uiop->uio_offset = (off_t)bp->b_lblkno * biosize + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; - error = fuse_write_directbackend(vp, uiop, cred, fufh, 0); + bool pages = bp->b_flags & B_FUSEFS_WRITE_CACHE; + error = fuse_write_directbackend(vp, uiop, cred, fufh, + filesize, 0, pages); - if (error == EINTR || error == ETIMEDOUT - || (!error && (bp->b_flags & B_NEEDCOMMIT))) { - + if (error == EINTR || error == ETIMEDOUT) { bp->b_flags &= ~(B_INVAL | B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); Index: sys/fs/fuse/fuse_ipc.h =================================================================== --- sys/fs/fuse/fuse_ipc.h +++ sys/fs/fuse/fuse_ipc.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -63,6 +68,12 @@ #include #include +enum fuse_data_cache_mode { + FUSE_CACHE_UC, + FUSE_CACHE_WT, + FUSE_CACHE_WB, +}; + struct fuse_iov { void *base; size_t len; @@ -103,6 +114,12 @@ struct fuse_data *tk_data; int tk_flag; u_int tk_refcount; + /* + * If this ticket's operation has been interrupted, this will hold the + * unique value of the FUSE_INTERRUPT operation. Otherwise, it will be + * 0. + */ + uint64_t irq_unique; /* fields for initiating an upgoing message */ struct fuse_iov tk_ms_fiov; @@ -147,16 +164,20 @@ ftick->tk_flag |= FT_ANSW; } +static inline struct fuse_in_header* +fticket_in_header(struct fuse_ticket *ftick) +{ + return (struct fuse_in_header *)(ftick->tk_ms_fiov.base); +} + static inline enum fuse_opcode fticket_opcode(struct fuse_ticket *ftick) { - return (((struct fuse_in_header *)(ftick->tk_ms_fiov.base))->opcode); + return fticket_in_header(ftick)->opcode; } int fticket_pull(struct fuse_ticket *ftick, struct uio *uio); -enum mountpri { FM_NOMOUNTED, FM_PRIMARY, FM_SECONDARY }; - /* * The data representing a FUSE session. */ @@ -170,10 +191,16 @@ struct mtx ms_mtx; STAILQ_HEAD(, fuse_ticket) ms_head; + int ms_count; struct mtx aw_mtx; TAILQ_HEAD(, fuse_ticket) aw_head; + /* + * Holds the next value of the FUSE operation unique value. + * Also, serves as a wakeup channel to prevent any operations from + * being created before INIT completes. 
+ */ u_long ticketer; struct sx rename_lock; @@ -181,6 +208,7 @@ uint32_t fuse_libabi_major; uint32_t fuse_libabi_minor; + uint32_t max_readahead_blocks; uint32_t max_write; uint32_t max_read; uint32_t subtype; @@ -189,34 +217,26 @@ struct selinfo ks_rsel; int daemon_timeout; + unsigned time_gran; uint64_t notimpl; + uint64_t mnt_flag; + enum fuse_data_cache_mode cache_mode; }; #define FSESS_DEAD 0x0001 /* session is to be closed */ -#define FSESS_UNUSED0 0x0002 /* unused */ #define FSESS_INITED 0x0004 /* session has been inited */ #define FSESS_DAEMON_CAN_SPY 0x0010 /* let non-owners access this fs */ /* (and being observed by the daemon) */ #define FSESS_PUSH_SYMLINKS_IN 0x0020 /* prefix absolute symlinks with mp */ #define FSESS_DEFAULT_PERMISSIONS 0x0040 /* kernel does permission checking */ -#define FSESS_NO_ATTRCACHE 0x0080 /* no attribute caching */ -#define FSESS_NO_READAHEAD 0x0100 /* no readaheads */ -#define FSESS_NO_DATACACHE 0x0200 /* disable buffer cache */ -#define FSESS_NO_NAMECACHE 0x0400 /* disable name cache */ -#define FSESS_NO_MMAP 0x0800 /* disable mmap */ -#define FSESS_BROKENIO 0x1000 /* fix broken io */ +#define FSESS_ASYNC_READ 0x1000 /* allow multiple reads of some file */ +#define FSESS_POSIX_LOCKS 0x2000 /* daemon supports POSIX locks */ +#define FSESS_EXPORT_SUPPORT 0x10000 /* daemon supports NFS-style lookups */ +#define FSESS_MNTOPTS_MASK ( \ + FSESS_DAEMON_CAN_SPY | FSESS_PUSH_SYMLINKS_IN | \ + FSESS_DEFAULT_PERMISSIONS) -enum fuse_data_cache_mode { - FUSE_CACHE_UC, - FUSE_CACHE_WT, - FUSE_CACHE_WB, -}; - extern int fuse_data_cache_mode; -extern int fuse_data_cache_invalidate; -extern int fuse_mmap_enable; -extern int fuse_sync_resize; -extern int fuse_fix_broken_io; static inline struct fuse_data * fuse_get_mpdata(struct mount *mp) @@ -245,36 +265,43 @@ { struct fuse_data *data = fuse_get_mpdata(mp); - return (fuse_data_cache_mode != FUSE_CACHE_UC && - (data->dataflags & FSESS_NO_DATACACHE) == 0); + return (data->cache_mode != FUSE_CACHE_UC); } static inline bool fsess_opt_mmap(struct mount *mp) { - struct fuse_data *data = fuse_get_mpdata(mp); - - if (!fuse_mmap_enable || fuse_data_cache_mode == FUSE_CACHE_UC) - return (false); - return ((data->dataflags & (FSESS_NO_DATACACHE | FSESS_NO_MMAP)) == 0); + return (fsess_opt_datacache(mp)); } static inline bool -fsess_opt_brokenio(struct mount *mp) +fsess_opt_writeback(struct mount *mp) { struct fuse_data *data = fuse_get_mpdata(mp); - return (fuse_fix_broken_io || (data->dataflags & FSESS_BROKENIO)); + return (data->cache_mode == FUSE_CACHE_WB); } +/* Insert a new upgoing message */ static inline void fuse_ms_push(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED); refcount_acquire(&ftick->tk_refcount); STAILQ_INSERT_TAIL(&ftick->tk_data->ms_head, ftick, tk_ms_link); + ftick->tk_data->ms_count++; } +/* Insert a new upgoing message to the front of the queue */ +static inline void +fuse_ms_push_head(struct fuse_ticket *ftick) +{ + mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED); + refcount_acquire(&ftick->tk_refcount); + STAILQ_INSERT_HEAD(&ftick->tk_data->ms_head, ftick, tk_ms_link); + ftick->tk_data->ms_count++; +} + static inline struct fuse_ticket * fuse_ms_pop(struct fuse_data *data) { @@ -284,7 +311,9 @@ if ((ftick = STAILQ_FIRST(&data->ms_head))) { STAILQ_REMOVE_HEAD(&data->ms_head, tk_ms_link); + data->ms_count--; #ifdef INVARIANTS + MPASS(data->ms_count >= 0); ftick->tk_ms_link.stqe_next = NULL; #endif } @@ -327,7 +356,7 @@ struct fuse_ticket *fuse_ticket_fetch(struct fuse_data 
*data); int fuse_ticket_drop(struct fuse_ticket *ftick); void fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t *handler); -void fuse_insert_message(struct fuse_ticket *ftick); +void fuse_insert_message(struct fuse_ticket *ftick, bool irq); static inline bool fuse_libabi_geq(struct fuse_data *data, uint32_t abi_maj, uint32_t abi_min) @@ -374,13 +403,15 @@ #endif } +void fdisp_refresh(struct fuse_dispatcher *fdip); + void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred); -void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, - struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred); - void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, + struct vnode *vp, struct thread *td, struct ucred *cred); + +void fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred); int fdisp_wait_answ(struct fuse_dispatcher *fdip); Index: sys/fs/fuse/fuse_ipc.c =================================================================== --- sys/fs/fuse/fuse_ipc.c +++ sys/fs/fuse/fuse_ipc.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -61,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -84,14 +90,17 @@ #include "fuse_ipc.h" #include "fuse_internal.h" -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , ipc, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , ipc, trace, "int", "char*"); +static void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, + struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred); +static void fuse_interrupt_send(struct fuse_ticket *otick, int err); static struct fuse_ticket *fticket_alloc(struct fuse_data *data); static void fticket_refresh(struct fuse_ticket *ftick); static void fticket_destroy(struct fuse_ticket *ftick); @@ -104,13 +113,10 @@ static fuse_handler_t fuse_standard_handler; -SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables"); -SYSCTL_STRING(_vfs_fusefs, OID_AUTO, version, CTLFLAG_RD, - FUSE_FREEBSD_VERSION, 0, "fuse-freebsd version"); -static int fuse_ticket_count = 0; +static counter_u64_t fuse_ticket_count; +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, ticket_count, CTLFLAG_RD, + &fuse_ticket_count, "Number of allocated tickets"); -SYSCTL_INT(_vfs_fusefs, OID_AUTO, ticket_count, CTLFLAG_RW, - &fuse_ticket_count, 0, "number of allocated tickets"); static long fuse_iov_permanent_bufsize = 1 << 19; SYSCTL_LONG(_vfs_fusefs, OID_AUTO, iov_permanent_bufsize, CTLFLAG_RW, @@ -125,25 +131,131 @@ MALLOC_DEFINE(M_FUSEMSG, "fuse_msgbuf", "fuse message buffer"); static uma_zone_t ticket_zone; -static void -fuse_block_sigs(sigset_t *oldset) +/* + * TODO: figure out how to timeout INTERRUPT requests, because the daemon may + * leagally never respond + */ +static int +fuse_interrupt_callback(struct fuse_ticket *tick, struct uio *uio) { - sigset_t newset; + struct fuse_ticket *otick, *x_tick; + struct fuse_interrupt_in *fii; + struct fuse_data *data = tick->tk_data; + bool found = false; - SIGFILLSET(newset); - SIGDELSET(newset, SIGKILL); - if (kern_sigprocmask(curthread, SIG_BLOCK, &newset, oldset, 0)) - panic("%s: Invalid operation for kern_sigprocmask()", - __func__); + fii = (struct fuse_interrupt_in*)((char*)tick->tk_ms_fiov.base + + sizeof(struct fuse_in_header)); + + fuse_lck_mtx_lock(data->aw_mtx); + TAILQ_FOREACH_SAFE(otick, &data->aw_head, tk_aw_link, x_tick) { + if (otick->tk_unique == fii->unique) { + found = true; + break; + } + } + fuse_lck_mtx_unlock(data->aw_mtx); + + if (!found) { + /* Original is already complete. Just return */ + return 0; + } + + /* Clear the original ticket's interrupt association */ + otick->irq_unique = 0; + + if (tick->tk_aw_ohead.error == ENOSYS) { + fsess_set_notimpl(data->mp, FUSE_INTERRUPT); + return 0; + } else if (tick->tk_aw_ohead.error == EAGAIN) { + /* + * There are two reasons we might get this: + * 1) the daemon received the INTERRUPT request before the + * original, or + * 2) the daemon received the INTERRUPT request after it + * completed the original request. + * In the first case we should re-send the INTERRUPT. In the + * second, we should ignore it. + */ + /* Resend */ + fuse_interrupt_send(otick, EINTR); + return 0; + } else { + /* Illegal FUSE_INTERRUPT response */ + return EINVAL; + } } -static void -fuse_restore_sigs(sigset_t *oldset) +/* Interrupt the operation otick. 
Return err as its error code */ +void +fuse_interrupt_send(struct fuse_ticket *otick, int err) { + struct fuse_dispatcher fdi; + struct fuse_interrupt_in *fii; + struct fuse_in_header *ftick_hdr; + struct fuse_data *data = otick->tk_data; + struct fuse_ticket *tick, *xtick; + struct ucred reused_creds; + gid_t reused_groups[1]; - if (kern_sigprocmask(curthread, SIG_SETMASK, oldset, NULL, 0)) - panic("%s: Invalid operation for kern_sigprocmask()", - __func__); + if (otick->irq_unique == 0) { + /* + * If the daemon hasn't yet received otick, then we can answer + * it ourselves and return. + */ + fuse_lck_mtx_lock(data->ms_mtx); + STAILQ_FOREACH_SAFE(tick, &otick->tk_data->ms_head, tk_ms_link, + xtick) { + if (tick == otick) { + STAILQ_REMOVE(&otick->tk_data->ms_head, tick, + fuse_ticket, tk_ms_link); + otick->tk_data->ms_count--; + otick->tk_ms_link.stqe_next = NULL; + fuse_lck_mtx_unlock(data->ms_mtx); + + fuse_lck_mtx_lock(otick->tk_aw_mtx); + if (!fticket_answered(otick)) { + fticket_set_answered(otick); + otick->tk_aw_errno = err; + wakeup(otick); + } + fuse_lck_mtx_unlock(otick->tk_aw_mtx); + + fuse_ticket_drop(tick); + return; + } + } + fuse_lck_mtx_unlock(data->ms_mtx); + + /* + * If the fuse daemon doesn't support interrupts, then there's + * nothing more that we can do + */ + if (!fsess_isimpl(data->mp, FUSE_INTERRUPT)) + return; + + /* + * If the fuse daemon has already received otick, then we must + * send FUSE_INTERRUPT. + */ + ftick_hdr = fticket_in_header(otick); + reused_creds.cr_uid = ftick_hdr->uid; + reused_groups[0] = ftick_hdr->gid; + reused_creds.cr_groups = reused_groups; + fdisp_init(&fdi, sizeof(*fii)); + fdisp_make_pid(&fdi, FUSE_INTERRUPT, data, ftick_hdr->nodeid, + ftick_hdr->pid, &reused_creds); + + fii = fdi.indata; + fii->unique = otick->tk_unique; + fuse_insert_callback(fdi.tick, fuse_interrupt_callback); + + otick->irq_unique = fdi.tick->tk_unique; + /* Interrupt ops should be delivered ASAP */ + fuse_insert_message(fdi.tick, true); + fdisp_destroy(&fdi); + } else { + /* This ticket has already been interrupted */ + } } void @@ -181,14 +293,19 @@ } fiov->allocated_size = FU_AT_LEAST(size); fiov->credit = fuse_iov_credit; + /* Clear data buffer after reallocation */ + bzero(fiov->base, size); + } else if (size > fiov->len) { + /* Clear newly extended portion of data buffer */ + bzero((char*)fiov->base + fiov->len, size - fiov->len); } fiov->len = size; } +/* Resize the fiov if needed, and clear it's buffer */ void fiov_refresh(struct fuse_iov *fiov) { - bzero(fiov->base, fiov->len); fiov_adjust(fiov, 0); } @@ -211,8 +328,10 @@ if (ftick->tk_unique == 0) ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1); + ftick->irq_unique = 0; + refcount_init(&ftick->tk_refcount, 1); - atomic_add_acq_int(&fuse_ticket_count, 1); + counter_u64_add(fuse_ticket_count, 1); return 0; } @@ -227,7 +346,7 @@ FUSE_ASSERT_MS_DONE(ftick); FUSE_ASSERT_AW_DONE(ftick); - atomic_subtract_acq_int(&fuse_ticket_count, 1); + counter_u64_add(fuse_ticket_count, -1); } static int @@ -269,7 +388,7 @@ return uma_zfree(ticket_zone, ftick); } -static inline +static inline void fticket_refresh(struct fuse_ticket *ftick) { @@ -292,15 +411,48 @@ ftick->tk_flag = 0; } +/* Prepar the ticket to be reused, but don't clear its data buffers */ +static inline void +fticket_reset(struct fuse_ticket *ftick) +{ + FUSE_ASSERT_MS_DONE(ftick); + FUSE_ASSERT_AW_DONE(ftick); + + ftick->tk_ms_bufdata = NULL; + ftick->tk_ms_bufsize = 0; + ftick->tk_ms_type = FT_M_FIOV; + + bzero(&ftick->tk_aw_ohead, sizeof(struct 
fuse_out_header)); + + ftick->tk_aw_errno = 0; + ftick->tk_aw_bufdata = NULL; + ftick->tk_aw_bufsize = 0; + ftick->tk_aw_type = FT_A_FIOV; + + ftick->tk_flag = 0; +} + static int fticket_wait_answer(struct fuse_ticket *ftick) { - sigset_t tset; - int err = 0; + struct thread *td = curthread; + sigset_t blockedset, oldset; + int err = 0, stops_deferred; struct fuse_data *data; + if (fsess_isimpl(ftick->tk_data->mp, FUSE_INTERRUPT)) { + SIGEMPTYSET(blockedset); + } else { + /* May as well block all signals */ + SIGFILLSET(blockedset); + SIGDELSET(blockedset, SIGKILL); + } + stops_deferred = sigdeferstop(SIGDEFERSTOP_SILENT); + kern_sigprocmask(td, SIG_BLOCK, NULL, &oldset, 0); + fuse_lck_mtx_lock(ftick->tk_aw_mtx); +retry: if (fticket_answered(ftick)) { goto out; } @@ -311,11 +463,13 @@ fticket_set_answered(ftick); goto out; } - fuse_block_sigs(&tset); + kern_sigprocmask(td, SIG_BLOCK, &blockedset, NULL, 0); err = msleep(ftick, &ftick->tk_aw_mtx, PCATCH, "fu_ans", data->daemon_timeout * hz); - fuse_restore_sigs(&tset); - if (err == EAGAIN) { /* same as EWOULDBLOCK */ + kern_sigprocmask(td, SIG_SETMASK, &oldset, NULL, 0); + if (err == EWOULDBLOCK) { + SDT_PROBE2(fusefs, , ipc, trace, 3, + "fticket_wait_answer: EWOULDBLOCK"); #ifdef XXXIP /* die conditionally */ if (!fdata_get_dead(data)) { fdata_set_dead(data); @@ -323,14 +477,58 @@ #endif err = ETIMEDOUT; fticket_set_answered(ftick); + } else if ((err == EINTR || err == ERESTART)) { + /* + * Whether we get EINTR or ERESTART depends on whether + * SA_RESTART was set by sigaction(2). + * + * Try to interrupt the operation and wait for an EINTR response + * to the original operation. If the file system does not + * support FUSE_INTERRUPT, then we'll just wait for it to + * complete like normal. If it does support FUSE_INTERRUPT, + * then it will either respond EINTR to the original operation, + * or EAGAIN to the interrupt. + */ + int sig; + bool fatal; + + SDT_PROBE2(fusefs, , ipc, trace, 4, + "fticket_wait_answer: interrupt"); + fuse_lck_mtx_unlock(ftick->tk_aw_mtx); + fuse_interrupt_send(ftick, err); + + PROC_LOCK(td->td_proc); + mtx_lock(&td->td_proc->p_sigacts->ps_mtx); + sig = cursig(td); + fatal = sig_isfatal(td->td_proc, sig); + mtx_unlock(&td->td_proc->p_sigacts->ps_mtx); + PROC_UNLOCK(td->td_proc); + + fuse_lck_mtx_lock(ftick->tk_aw_mtx); + if (!fatal) { + /* + * Block the just-delivered signal while we wait for an + * interrupt response + */ + SIGADDSET(blockedset, sig); + goto retry; + } else { + /* Return immediately for fatal signals */ + } + } else if (err) { + SDT_PROBE2(fusefs, , ipc, trace, 6, + "fticket_wait_answer: other error"); + } else { + SDT_PROBE2(fusefs, , ipc, trace, 7, "fticket_wait_answer: OK"); } out: if (!(err || fticket_answered(ftick))) { - SDT_PROBE2(fuse, , ipc, trace, 1, + SDT_PROBE2(fusefs, , ipc, trace, 1, "FUSE: requester was woken up but still no answer"); err = ENXIO; } fuse_lck_mtx_unlock(ftick->tk_aw_mtx); + sigallowstop(stops_deferred); return err; } @@ -386,6 +584,8 @@ data->fdev = fdev; mtx_init(&data->ms_mtx, "fuse message list mutex", NULL, MTX_DEF); STAILQ_INIT(&data->ms_head); + data->ms_count = 0; + knlist_init_mtx(&data->ks_rsel.si_note, &data->ms_mtx); mtx_init(&data->aw_mtx, "fuse answer list mutex", NULL, MTX_DEF); TAILQ_INIT(&data->aw_head); data->daemoncred = crhold(cred); @@ -405,11 +605,12 @@ return; /* Driving off stage all that stuff thrown at device... 
*/ - mtx_destroy(&data->ms_mtx); - mtx_destroy(&data->aw_mtx); sx_destroy(&data->rename_lock); - crfree(data->daemoncred); + mtx_destroy(&data->aw_mtx); + knlist_delete(&data->ks_rsel.si_note, curthread, 0); + knlist_destroy(&data->ks_rsel.si_note); + mtx_destroy(&data->ms_mtx); free(data, M_FUSEMSG); } @@ -478,8 +679,14 @@ fuse_lck_mtx_unlock(ftick->tk_data->aw_mtx); } +/* + * Insert a new upgoing ticket into the message queue + * + * If urgent is true, insert at the front of the queue. Otherwise, insert in + * FIFO order. + */ void -fuse_insert_message(struct fuse_ticket *ftick) +fuse_insert_message(struct fuse_ticket *ftick, bool urgent) { if (ftick->tk_flag & FT_DIRTY) { panic("FUSE: ticket reused without being refreshed"); @@ -490,9 +697,13 @@ return; } fuse_lck_mtx_lock(ftick->tk_data->ms_mtx); - fuse_ms_push(ftick); + if (urgent) + fuse_ms_push_head(ftick); + else + fuse_ms_push(ftick); wakeup_one(ftick->tk_data); selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1); + KNOTE_LOCKED(&ftick->tk_data->ks_rsel.si_note, 0); fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx); } @@ -505,8 +716,21 @@ opcode = fticket_opcode(ftick); switch (opcode) { + case FUSE_BMAP: + err = (blen == sizeof(struct fuse_bmap_out)) ? 0 : EINVAL; + break; + + case FUSE_LINK: case FUSE_LOOKUP: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; + case FUSE_MKDIR: + case FUSE_MKNOD: + case FUSE_SYMLINK: + if (fuse_libabi_geq(ftick->tk_data, 7, 9)) { + err = (blen == sizeof(struct fuse_entry_out)) ? + 0 : EINVAL; + } else { + err = (blen == FUSE_COMPAT_ENTRY_OUT_SIZE) ? 0 : EINVAL; + } break; case FUSE_FORGET: @@ -514,29 +738,19 @@ break; case FUSE_GETATTR: - err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL; - break; - case FUSE_SETATTR: - err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL; + if (fuse_libabi_geq(ftick->tk_data, 7, 9)) { + err = (blen == sizeof(struct fuse_attr_out)) ? + 0 : EINVAL; + } else { + err = (blen == FUSE_COMPAT_ATTR_OUT_SIZE) ? 0 : EINVAL; + } break; case FUSE_READLINK: err = (PAGE_SIZE >= blen) ? 0 : EINVAL; break; - case FUSE_SYMLINK: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; - break; - - case FUSE_MKNOD: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; - break; - - case FUSE_MKDIR: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; - break; - case FUSE_UNLINK: err = (blen == 0) ? 0 : EINVAL; break; @@ -549,10 +763,6 @@ err = (blen == 0) ? 0 : EINVAL; break; - case FUSE_LINK: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; - break; - case FUSE_OPEN: err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL; break; @@ -607,7 +817,9 @@ break; case FUSE_INIT: - if (blen == sizeof(struct fuse_init_out) || blen == 8) { + if (blen == sizeof(struct fuse_init_out) || + blen == FUSE_COMPAT_INIT_OUT_SIZE || + blen == FUSE_COMPAT_22_INIT_OUT_SIZE) { err = 0; } else { err = EINVAL; @@ -634,15 +846,15 @@ break; case FUSE_GETLK: - panic("FUSE: no response body format check for FUSE_GETLK"); + err = (blen == sizeof(struct fuse_lk_out)) ? 0 : EINVAL; break; case FUSE_SETLK: - panic("FUSE: no response body format check for FUSE_SETLK"); + err = (blen == 0) ? 0 : EINVAL; break; case FUSE_SETLKW: - panic("FUSE: no response body format check for FUSE_SETLKW"); + err = (blen == 0) ? 0 : EINVAL; break; case FUSE_ACCESS: @@ -650,8 +862,13 @@ break; case FUSE_CREATE: - err = (blen == sizeof(struct fuse_entry_out) + - sizeof(struct fuse_open_out)) ? 
0 : EINVAL; + if (fuse_libabi_geq(ftick->tk_data, 7, 9)) { + err = (blen == sizeof(struct fuse_entry_out) + + sizeof(struct fuse_open_out)) ? 0 : EINVAL; + } else { + err = (blen == FUSE_COMPAT_ENTRY_OUT_SIZE + + sizeof(struct fuse_open_out)) ? 0 : EINVAL; + } break; case FUSE_DESTROY: @@ -677,7 +894,7 @@ ihead->pid = pid; ihead->uid = cred->cr_uid; - ihead->gid = cred->cr_rgid; + ihead->gid = cred->cr_groups[0]; } /* @@ -705,18 +922,38 @@ return err; } -void -fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, +/* + * Reinitialize a dispatcher from a pid and node id, without resizing or + * clearing its data buffers + */ +static void +fdisp_refresh_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred) { - struct fuse_data *data = fuse_get_mpdata(mp); + MPASS(fdip->tick); + MPASS2(sizeof(fdip->finh) + fdip->iosize <= fdip->tick->tk_ms_fiov.len, + "Must use fdisp_make_pid to increase the size of the fiov"); + fticket_reset(fdip->tick); + FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh, + fdip->indata, fdip->iosize); + + fuse_setup_ihead(fdip->finh, fdip->tick, nid, op, fdip->iosize, pid, + cred); +} + +/* Initialize a dispatcher from a pid and node id */ +static void +fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, + struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred) +{ if (fdip->tick) { fticket_refresh(fdip->tick); } else { fdip->tick = fuse_ticket_fetch(data); } + /* FUSE_DIMALLOC will bzero the fiovs when it enlarges them */ FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh, fdip->indata, fdip->iosize); @@ -727,22 +964,42 @@ fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred) { + struct fuse_data *data = fuse_get_mpdata(mp); RECTIFY_TDCR(td, cred); - return fdisp_make_pid(fdip, op, mp, nid, td->td_proc->p_pid, cred); + return fdisp_make_pid(fdip, op, data, nid, td->td_proc->p_pid, cred); } void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred) { + struct mount *mp = vnode_mount(vp); + struct fuse_data *data = fuse_get_mpdata(mp); + RECTIFY_TDCR(td, cred); - return fdisp_make_pid(fdip, op, vnode_mount(vp), VTOI(vp), + return fdisp_make_pid(fdip, op, data, VTOI(vp), td->td_proc->p_pid, cred); } -SDT_PROBE_DEFINE2(fuse, , ipc, fdisp_wait_answ_error, "char*", "int"); +/* Refresh a fuse_dispatcher so it can be reused, but don't zero its data */ +void +fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, + struct vnode *vp, struct thread *td, struct ucred *cred) +{ + RECTIFY_TDCR(td, cred); + return fdisp_refresh_pid(fdip, op, vnode_mount(vp), VTOI(vp), + td->td_proc->p_pid, cred); +} +void +fdisp_refresh(struct fuse_dispatcher *fdip) +{ + fticket_refresh(fdip->tick); +} + +SDT_PROBE_DEFINE2(fusefs, , ipc, fdisp_wait_answ_error, "char*", "int"); + int fdisp_wait_answ(struct fuse_dispatcher *fdip) { @@ -750,7 +1007,7 @@ fdip->answ_stat = 0; fuse_insert_callback(fdip->tick, fuse_standard_handler); - fuse_insert_message(fdip->tick); + fuse_insert_message(fdip->tick, false); if ((err = fticket_wait_answer(fdip->tick))) { fuse_lck_mtx_lock(fdip->tick->tk_aw_mtx); @@ -761,7 +1018,7 @@ * the standard handler has completed his job. * So we drop the ticket and exit as usual. 
*/ - SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, + SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error, "IPC: interrupted, already answered", err); fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx); goto out; @@ -771,7 +1028,7 @@ * Then by setting the answered flag we get *him* * to drop the ticket. */ - SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, + SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error, "IPC: interrupted, setting to answered", err); fticket_set_answered(fdip->tick); fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx); @@ -779,14 +1036,22 @@ } } - if (fdip->tick->tk_aw_errno) { - SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, + if (fdip->tick->tk_aw_errno == ENOTCONN) { + /* The daemon died while we were waiting for a response */ + err = ENOTCONN; + goto out; + } else if (fdip->tick->tk_aw_errno) { + /* + * There was some sort of communication error with the daemon + * that the client wouldn't understand. + */ + SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error, "IPC: explicit EIO-ing", fdip->tick->tk_aw_errno); err = EIO; goto out; } if ((err = fdip->tick->tk_aw_ohead.error)) { - SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, + SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error, "IPC: setting status", fdip->tick->tk_aw_ohead.error); /* * This means a "proper" fuse syscall error. @@ -815,10 +1080,13 @@ ticket_zone = uma_zcreate("fuse_ticket", sizeof(struct fuse_ticket), fticket_ctor, fticket_dtor, fticket_init, fticket_fini, UMA_ALIGN_PTR, 0); + fuse_ticket_count = counter_u64_alloc(M_WAITOK); + counter_u64_zero(fuse_ticket_count); } void fuse_ipc_destroy(void) { + counter_u64_free(fuse_ticket_count); uma_zdestroy(ticket_zone); } Index: sys/fs/fuse/fuse_kernel.h =================================================================== --- sys/fs/fuse/fuse_kernel.h +++ sys/fs/fuse/fuse_kernel.h @@ -1,6 +1,6 @@ /*-- * This file defines the kernel interface of FUSE - * Copyright (C) 2001-2007 Miklos Szeredi + * Copyright (C) 2001-2008 Miklos Szeredi * * This program can be distributed under the terms of the GNU GPL. * See the file COPYING. 
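The reply-length validation added to fticket_pull() above is keyed to the negotiated ABI minor version: protocol 7.9 appended blksize and padding to struct fuse_attr, so servers speaking an older protocol legitimately return entry and attr bodies that are eight bytes shorter. The standalone sketch below is illustrative only (it is not part of the change); it just works through that arithmetic to confirm the FUSE_COMPAT_*_SIZE constants the checks rely on.

/*
 * Size arithmetic behind the per-ABI reply checks.  struct fuse_attr is
 * embedded in both fuse_entry_out and fuse_attr_out, so growing it by
 * 8 bytes in protocol 7.9 shifted all four expected reply sizes.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int
main(void)
{
        size_t attr_7_8 = 6 * sizeof(uint64_t) + 8 * sizeof(uint32_t);  /* 80 */
        size_t attr_7_9 = attr_7_8 + 2 * sizeof(uint32_t);   /* 88: + blksize, padding */
        size_t entry_hdr = 4 * sizeof(uint64_t) + 2 * sizeof(uint32_t); /* 40 */
        size_t attr_hdr = sizeof(uint64_t) + 2 * sizeof(uint32_t);      /* 16 */

        assert(entry_hdr + attr_7_8 == 120);  /* FUSE_COMPAT_ENTRY_OUT_SIZE */
        assert(entry_hdr + attr_7_9 == 128);  /* sizeof(struct fuse_entry_out), 7.9+ */
        assert(attr_hdr + attr_7_8 == 96);    /* FUSE_COMPAT_ATTR_OUT_SIZE */
        assert(attr_hdr + attr_7_9 == 104);   /* sizeof(struct fuse_attr_out), 7.9+ */
        return (0);
}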
@@ -34,69 +34,134 @@ * $FreeBSD$ */ -#ifndef linux -#include -#define __u64 uint64_t -#define __u32 uint32_t -#define __s32 int32_t +/* + * This file defines the kernel interface of FUSE + * + * Protocol changelog: + * + * 7.9: + * - new fuse_getattr_in input argument of GETATTR + * - add lk_flags in fuse_lk_in + * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in + * - add blksize field to fuse_attr + * - add file flags field to fuse_read_in and fuse_write_in + * + * 7.10 + * - add nonseekable open flag + * + * 7.11 + * - add IOCTL message + * - add unsolicited notification support + * + * 7.12 + * - add umask flag to input argument of open, mknod and mkdir + * - add notification messages for invalidation of inodes and + * directory entries + * + * 7.13 + * - make max number of background requests and congestion threshold + * tunables + * + * 7.14 + * - add splice support to fuse device + * + * 7.15 + * - add store notify + * - add retrieve notify + * + * 7.16 + * - add BATCH_FORGET request + * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct + * fuse_ioctl_iovec' instead of ambiguous 'struct iovec' + * - add FUSE_IOCTL_32BIT flag + * + * 7.17 + * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK + * + * 7.18 + * - add FUSE_IOCTL_DIR flag + * - add FUSE_NOTIFY_DELETE + * + * 7.19 + * - add FUSE_FALLOCATE + * + * 7.20 + * - add FUSE_AUTO_INVAL_DATA + * 7.21 + * - add FUSE_READDIRPLUS + * - send the requested events in POLL request + * + * 7.22 + * - add FUSE_ASYNC_DIO + * + * 7.23 + * - add FUSE_WRITEBACK_CACHE + * - add time_gran to fuse_init_out + * - add reserved space to fuse_init_out + * - add FATTR_CTIME + * - add ctime and ctimensec to fuse_setattr_in + * - add FUSE_RENAME2 request + * - add FUSE_NO_OPEN_SUPPORT flag + */ + +#ifndef _FUSE_FUSE_KERNEL_H +#define _FUSE_FUSE_KERNEL_H + +#ifdef __linux__ +#include #else -#include -#include +#include #endif /** Version number of this interface */ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 8 +#define FUSE_KERNEL_MINOR_VERSION 23 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 -/** The major number of the fuse character device */ -#define FUSE_MAJOR MISC_MAJOR - -/** The minor number of the fuse character device */ -#define FUSE_MINOR 229 - /* Make sure all structures are padded to 64bit boundary, so 32bit userspace works under 64bit kernels */ struct fuse_attr { - __u64 ino; - __u64 size; - __u64 blocks; - __u64 atime; - __u64 mtime; - __u64 ctime; - __u32 atimensec; - __u32 mtimensec; - __u32 ctimensec; - __u32 mode; - __u32 nlink; - __u32 uid; - __u32 gid; - __u32 rdev; + uint64_t ino; + uint64_t size; + uint64_t blocks; + uint64_t atime; + uint64_t mtime; + uint64_t ctime; + uint32_t atimensec; + uint32_t mtimensec; + uint32_t ctimensec; + uint32_t mode; + uint32_t nlink; + uint32_t uid; + uint32_t gid; + uint32_t rdev; + uint32_t blksize; + uint32_t padding; }; struct fuse_kstatfs { - __u64 blocks; - __u64 bfree; - __u64 bavail; - __u64 files; - __u64 ffree; - __u32 bsize; - __u32 namelen; - __u32 frsize; - __u32 padding; - __u32 spare[6]; + uint64_t blocks; + uint64_t bfree; + uint64_t bavail; + uint64_t files; + uint64_t ffree; + uint32_t bsize; + uint32_t namelen; + uint32_t frsize; + uint32_t padding; + uint32_t spare[6]; }; struct fuse_file_lock { - __u64 start; - __u64 end; - __u32 type; - __u32 pid; /* tgid */ + uint64_t start; + uint64_t end; + uint32_t type; + uint32_t pid; /* tgid */ }; /** @@ -109,27 
+174,128 @@ #define FATTR_ATIME (1 << 4) #define FATTR_MTIME (1 << 5) #define FATTR_FH (1 << 6) +#define FATTR_ATIME_NOW (1 << 7) +#define FATTR_MTIME_NOW (1 << 8) +#define FATTR_LOCKOWNER (1 << 9) +#define FATTR_CTIME (1 << 10) /** * Flags returned by the OPEN request * * FOPEN_DIRECT_IO: bypass page cache for this open file * FOPEN_KEEP_CACHE: don't invalidate the data cache on open + * FOPEN_NONSEEKABLE: the file is not seekable */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) +#define FOPEN_NONSEEKABLE (1 << 2) /** * INIT request/reply flags + * + * FUSE_ASYNC_READ: asynchronous read requests + * FUSE_POSIX_LOCKS: remote locking for POSIX file locks + * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported) + * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem + * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." + * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB + * FUSE_DONT_MASK: don't apply umask to file mode on create operations + * FUSE_SPLICE_WRITE: kernel supports splice write on the device + * FUSE_SPLICE_MOVE: kernel supports splice move on the device + * FUSE_SPLICE_READ: kernel supports splice read on the device + * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks + * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories + * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages + * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one) + * FUSE_READDIRPLUS_AUTO: adaptive readdirplus + * FUSE_ASYNC_DIO: asynchronous direct I/O submission + * FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes + * FUSE_NO_OPEN_SUPPORT: kernel supports zero-message opens */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) +#define FUSE_FILE_OPS (1 << 2) +#define FUSE_ATOMIC_O_TRUNC (1 << 3) +#define FUSE_EXPORT_SUPPORT (1 << 4) +#define FUSE_BIG_WRITES (1 << 5) +#define FUSE_DONT_MASK (1 << 6) +#define FUSE_SPLICE_WRITE (1 << 7) +#define FUSE_SPLICE_MOVE (1 << 8) +#define FUSE_SPLICE_READ (1 << 9) +#define FUSE_FLOCK_LOCKS (1 << 10) +#define FUSE_HAS_IOCTL_DIR (1 << 11) +#define FUSE_AUTO_INVAL_DATA (1 << 12) +#define FUSE_DO_READDIRPLUS (1 << 13) +#define FUSE_READDIRPLUS_AUTO (1 << 14) +#define FUSE_ASYNC_DIO (1 << 15) +#define FUSE_WRITEBACK_CACHE (1 << 16) +#define FUSE_NO_OPEN_SUPPORT (1 << 17) +#ifdef linux /** + * CUSE INIT request/reply flags + * + * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl + */ +#define CUSE_UNRESTRICTED_IOCTL (1 << 0) +#endif /* linux */ + +/** * Release flags */ #define FUSE_RELEASE_FLUSH (1 << 0) +#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1) +/** + * Getattr flags + */ +#define FUSE_GETATTR_FH (1 << 0) + +/** + * Lock flags + */ +#define FUSE_LK_FLOCK (1 << 0) + +/** + * WRITE flags + * + * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed + * FUSE_WRITE_LOCKOWNER: lock_owner field is valid + */ +#define FUSE_WRITE_CACHE (1 << 0) +#define FUSE_WRITE_LOCKOWNER (1 << 1) + +/** + * Read flags + */ +#define FUSE_READ_LOCKOWNER (1 << 1) + +/** + * Ioctl flags + * + * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine + * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed + * FUSE_IOCTL_RETRY: retry with new iovecs + * FUSE_IOCTL_32BIT: 32bit ioctl + * FUSE_IOCTL_DIR: is a directory + * + * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs + */ +#define FUSE_IOCTL_COMPAT (1 << 0) +#define FUSE_IOCTL_UNRESTRICTED (1 << 1) +#define FUSE_IOCTL_RETRY (1 << 2) +#define 
FUSE_IOCTL_32BIT (1 << 3) +#define FUSE_IOCTL_DIR (1 << 4) + +#define FUSE_IOCTL_MAX_IOV 256 + +/** + * Poll flags + * + * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify + */ +#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0) + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ @@ -167,107 +333,179 @@ FUSE_INTERRUPT = 36, FUSE_BMAP = 37, FUSE_DESTROY = 38, + FUSE_IOCTL = 39, + FUSE_POLL = 40, + FUSE_NOTIFY_REPLY = 41, + FUSE_BATCH_FORGET = 42, + FUSE_FALLOCATE = 43, + FUSE_READDIRPLUS = 44, + FUSE_RENAME2 = 45, + +#ifdef linux + /* CUSE specific operations */ + CUSE_INIT = 4096, +#endif /* linux */ }; +enum fuse_notify_code { + FUSE_NOTIFY_POLL = 1, + FUSE_NOTIFY_INVAL_INODE = 2, + FUSE_NOTIFY_INVAL_ENTRY = 3, + FUSE_NOTIFY_STORE = 4, + FUSE_NOTIFY_RETRIEVE = 5, + FUSE_NOTIFY_DELETE = 6, + FUSE_NOTIFY_CODE_MAX, +}; + /* The read buffer is required to be at least 8k, but may be much larger */ #define FUSE_MIN_READ_BUFFER 8192 +#define FUSE_COMPAT_ENTRY_OUT_SIZE 120 + struct fuse_entry_out { - __u64 nodeid; /* Inode ID */ - __u64 generation; /* Inode generation: nodeid:gen must - be unique for the fs's lifetime */ - __u64 entry_valid; /* Cache timeout for the name */ - __u64 attr_valid; /* Cache timeout for the attributes */ - __u32 entry_valid_nsec; - __u32 attr_valid_nsec; + uint64_t nodeid; /* Inode ID */ + uint64_t generation; /* Inode generation: nodeid:gen must + be unique for the fs's lifetime */ + uint64_t entry_valid; /* Cache timeout for the name */ + uint64_t attr_valid; /* Cache timeout for the attributes */ + uint32_t entry_valid_nsec; + uint32_t attr_valid_nsec; struct fuse_attr attr; }; struct fuse_forget_in { - __u64 nlookup; + uint64_t nlookup; }; +struct fuse_forget_one { + uint64_t nodeid; + uint64_t nlookup; +}; + +struct fuse_batch_forget_in { + uint32_t count; + uint32_t dummy; +}; + +struct fuse_getattr_in { + uint32_t getattr_flags; + uint32_t dummy; + uint64_t fh; +}; + +#define FUSE_COMPAT_ATTR_OUT_SIZE 96 + struct fuse_attr_out { - __u64 attr_valid; /* Cache timeout for the attributes */ - __u32 attr_valid_nsec; - __u32 dummy; + uint64_t attr_valid; /* Cache timeout for the attributes */ + uint32_t attr_valid_nsec; + uint32_t dummy; struct fuse_attr attr; }; +#define FUSE_COMPAT_MKNOD_IN_SIZE 8 + +struct fuse_mknod_in { + uint32_t mode; + uint32_t rdev; + uint32_t umask; + uint32_t padding; +}; + struct fuse_mkdir_in { - __u32 mode; - __u32 padding; + uint32_t mode; + uint32_t umask; }; struct fuse_rename_in { - __u64 newdir; + uint64_t newdir; }; +struct fuse_rename2_in { + uint64_t newdir; + uint32_t flags; + uint32_t padding; +}; + struct fuse_link_in { - __u64 oldnodeid; + uint64_t oldnodeid; }; struct fuse_setattr_in { - __u32 valid; - __u32 padding; - __u64 fh; - __u64 size; - __u64 unused1; - __u64 atime; - __u64 mtime; - __u64 unused2; - __u32 atimensec; - __u32 mtimensec; - __u32 unused3; - __u32 mode; - __u32 unused4; - __u32 uid; - __u32 gid; - __u32 unused5; + uint32_t valid; + uint32_t padding; + uint64_t fh; + uint64_t size; + uint64_t lock_owner; + uint64_t atime; + uint64_t mtime; + uint64_t ctime; + uint32_t atimensec; + uint32_t mtimensec; + uint32_t ctimensec; + uint32_t mode; + uint32_t unused4; + uint32_t uid; + uint32_t gid; + uint32_t unused5; }; struct fuse_open_in { - __u32 flags; - __u32 mode; + uint32_t flags; + uint32_t unused; }; +struct fuse_create_in { + uint32_t flags; + uint32_t mode; + uint32_t umask; + uint32_t padding; +}; + struct fuse_open_out { - __u64 fh; - __u32 open_flags; - __u32 padding; + uint64_t fh; + 
uint32_t open_flags; + uint32_t padding; }; struct fuse_release_in { - __u64 fh; - __u32 flags; - __u32 release_flags; - __u64 lock_owner; + uint64_t fh; + uint32_t flags; + uint32_t release_flags; + uint64_t lock_owner; }; struct fuse_flush_in { - __u64 fh; - __u32 unused; - __u32 padding; - __u64 lock_owner; + uint64_t fh; + uint32_t unused; + uint32_t padding; + uint64_t lock_owner; }; struct fuse_read_in { - __u64 fh; - __u64 offset; - __u32 size; - __u32 padding; + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t read_flags; + uint64_t lock_owner; + uint32_t flags; + uint32_t padding; }; +#define FUSE_COMPAT_WRITE_IN_SIZE 24 + struct fuse_write_in { - __u64 fh; - __u64 offset; - __u32 size; - __u32 write_flags; + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t write_flags; + uint64_t lock_owner; + uint32_t flags; + uint32_t padding; }; struct fuse_write_out { - __u32 size; - __u32 padding; + uint32_t size; + uint32_t padding; }; #define FUSE_COMPAT_STATFS_SIZE 48 @@ -277,40 +515,42 @@ }; struct fuse_fsync_in { - __u64 fh; - __u32 fsync_flags; - __u32 padding; + uint64_t fh; + uint32_t fsync_flags; + uint32_t padding; }; +struct fuse_setxattr_in { + uint32_t size; + uint32_t flags; +}; + struct fuse_listxattr_in { - __u32 size; - __u32 flags; + uint32_t size; + uint32_t padding; }; struct fuse_listxattr_out { - __u32 size; - __u32 flags; + uint32_t size; + uint32_t padding; }; struct fuse_getxattr_in { - __u32 size; - __u32 padding; + uint32_t size; + uint32_t padding; }; struct fuse_getxattr_out { - __u32 size; - __u32 padding; + uint32_t size; + uint32_t padding; }; -struct fuse_setxattr_in { - __u32 size; - __u32 flags; -}; - struct fuse_lk_in { - __u64 fh; - __u64 owner; + uint64_t fh; + uint64_t owner; struct fuse_file_lock lk; + uint32_t lk_flags; + uint32_t padding; }; struct fuse_lk_out { @@ -318,66 +558,197 @@ }; struct fuse_access_in { - __u32 mask; - __u32 padding; + uint32_t mask; + uint32_t padding; }; struct fuse_init_in { - __u32 major; - __u32 minor; - __u32 max_readahead; - __u32 flags; + uint32_t major; + uint32_t minor; + uint32_t max_readahead; + uint32_t flags; }; +#define FUSE_COMPAT_INIT_OUT_SIZE 8 +#define FUSE_COMPAT_22_INIT_OUT_SIZE 24 + struct fuse_init_out { - __u32 major; - __u32 minor; - __u32 max_readahead; - __u32 flags; - __u32 unused; - __u32 max_write; + uint32_t major; + uint32_t minor; + uint32_t max_readahead; + uint32_t flags; + uint16_t max_background; + uint16_t congestion_threshold; + uint32_t max_write; + uint32_t time_gran; + uint32_t unused[9]; }; +#ifdef linux +#define CUSE_INIT_INFO_MAX 4096 + +struct cuse_init_in { + uint32_t major; + uint32_t minor; + uint32_t unused; + uint32_t flags; +}; + +struct cuse_init_out { + uint32_t major; + uint32_t minor; + uint32_t unused; + uint32_t flags; + uint32_t max_read; + uint32_t max_write; + uint32_t dev_major; /* chardev major */ + uint32_t dev_minor; /* chardev minor */ + uint32_t spare[10]; +}; +#endif /* linux */ + struct fuse_interrupt_in { - __u64 unique; + uint64_t unique; }; struct fuse_bmap_in { - __u64 block; - __u32 blocksize; - __u32 padding; + uint64_t block; + uint32_t blocksize; + uint32_t padding; }; struct fuse_bmap_out { - __u64 block; + uint64_t block; }; +struct fuse_ioctl_in { + uint64_t fh; + uint32_t flags; + uint32_t cmd; + uint64_t arg; + uint32_t in_size; + uint32_t out_size; +}; + +struct fuse_ioctl_iovec { + uint64_t base; + uint64_t len; +}; + +struct fuse_ioctl_out { + int32_t result; + uint32_t flags; + uint32_t in_iovs; + uint32_t out_iovs; +}; 
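struct fuse_init_out has grown twice over the protocol's history, which is why the FUSE_INIT case in fticket_pull() accepts three distinct reply lengths. The sketch below is illustrative only: it uses a local copy of the 7.23 layout so it compiles on its own, and the type and variable names are hypothetical.

/*
 * The three FUSE_INIT reply sizes accepted by the kernel: very old servers
 * send only major and minor; servers predating protocol 7.23 stop after
 * max_write; 7.23 and newer send the whole structure (time_gran plus
 * reserved space).
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct init_out_7_23 {          /* local mirror of struct fuse_init_out */
        uint32_t major;
        uint32_t minor;
        uint32_t max_readahead;
        uint32_t flags;
        uint16_t max_background;
        uint16_t congestion_threshold;
        uint32_t max_write;
        uint32_t time_gran;
        uint32_t unused[9];
};

int
main(void)
{
        size_t compat = 2 * sizeof(uint32_t);             /*  8, FUSE_COMPAT_INIT_OUT_SIZE */
        size_t compat_22 = offsetof(struct init_out_7_23, time_gran);
                                                          /* 24, FUSE_COMPAT_22_INIT_OUT_SIZE */
        size_t full = sizeof(struct init_out_7_23);       /* 64 */

        printf("FUSE_INIT reply sizes: %zu %zu %zu\n", compat, compat_22, full);
        return (0);
}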
+ +struct fuse_poll_in { + uint64_t fh; + uint64_t kh; + uint32_t flags; + uint32_t events; +}; + +struct fuse_poll_out { + uint32_t revents; + uint32_t padding; +}; + +struct fuse_notify_poll_wakeup_out { + uint64_t kh; +}; + +struct fuse_fallocate_in { + uint64_t fh; + uint64_t offset; + uint64_t length; + uint32_t mode; + uint32_t padding; +}; + struct fuse_in_header { - __u32 len; - __u32 opcode; - __u64 unique; - __u64 nodeid; - __u32 uid; - __u32 gid; - __u32 pid; - __u32 padding; + uint32_t len; + uint32_t opcode; + uint64_t unique; + uint64_t nodeid; + uint32_t uid; + uint32_t gid; + uint32_t pid; + uint32_t padding; }; struct fuse_out_header { - __u32 len; - __s32 error; - __u64 unique; + uint32_t len; + int32_t error; + uint64_t unique; }; struct fuse_dirent { - __u64 ino; - __u64 off; - __u32 namelen; - __u32 type; - char name[0]; + uint64_t ino; + uint64_t off; + uint32_t namelen; + uint32_t type; + char name[]; }; #define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) -#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1)) +#define FUSE_DIRENT_ALIGN(x) \ + (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1)) #define FUSE_DIRENT_SIZE(d) \ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) + +struct fuse_direntplus { + struct fuse_entry_out entry_out; + struct fuse_dirent dirent; +}; + +#define FUSE_NAME_OFFSET_DIRENTPLUS \ + offsetof(struct fuse_direntplus, dirent.name) +#define FUSE_DIRENTPLUS_SIZE(d) \ + FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen) + +struct fuse_notify_inval_inode_out { + uint64_t ino; + int64_t off; + int64_t len; +}; + +struct fuse_notify_inval_entry_out { + uint64_t parent; + uint32_t namelen; + uint32_t padding; +}; + +struct fuse_notify_delete_out { + uint64_t parent; + uint64_t child; + uint32_t namelen; + uint32_t padding; +}; + +struct fuse_notify_store_out { + uint64_t nodeid; + uint64_t offset; + uint32_t size; + uint32_t padding; +}; + +struct fuse_notify_retrieve_out { + uint64_t notify_unique; + uint64_t nodeid; + uint64_t offset; + uint32_t size; + uint32_t padding; +}; + +/* Matches the size of fuse_write_in */ +struct fuse_notify_retrieve_in { + uint64_t dummy1; + uint64_t offset; + uint32_t size; + uint32_t dummy2; + uint64_t dummy3; + uint64_t dummy4; +}; + +#endif /* _FUSE_FUSE_KERNEL_H */ Index: sys/fs/fuse/fuse_main.c =================================================================== --- sys/fs/fuse/fuse_main.c +++ sys/fs/fuse/fuse_main.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. 
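The fuse_dirent and fuse_direntplus definitions above require every entry in a READDIR or READDIRPLUS reply to be padded to a 64-bit boundary. A small worked example follows; it is not part of the change, and ALIGN64 is just a local stand-in for FUSE_DIRENT_ALIGN.

/*
 * Worked example: space consumed by the directory entry "hello"
 * (namelen = 5) in READDIR and READDIRPLUS replies.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define ALIGN64(x)      (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))

int
main(void)
{
        size_t name_offset = 8 + 8 + 4 + 4;     /* ino, off, namelen, type */

        /* FUSE_DIRENT_SIZE: 24-byte header + 5-byte name, padded up to 32. */
        assert(ALIGN64(name_offset + 5) == 32);

        /*
         * FUSE_DIRENTPLUS_SIZE prepends a struct fuse_entry_out (128 bytes
         * on protocol 7.9 and later), so the same entry costs 160 bytes in
         * a READDIRPLUS reply.
         */
        assert(ALIGN64(128 + name_offset + 5) == 160);
        return (0);
}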
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -77,6 +82,10 @@ #include #include "fuse.h" +#include "fuse_file.h" +#include "fuse_ipc.h" +#include "fuse_internal.h" +#include "fuse_node.h" static void fuse_bringdown(eventhandler_tag eh_tag); static int fuse_loader(struct module *m, int what, void *arg); @@ -85,6 +94,7 @@ extern struct vfsops fuse_vfsops; extern struct cdevsw fuse_cdevsw; +extern struct vop_vector fuse_fifonops; extern struct vop_vector fuse_vnops; extern uma_zone_t fuse_pbuf_zone; @@ -96,11 +106,13 @@ .vfc_flags = VFCF_JAIL | VFCF_SYNTHETIC }; +SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables"); +SYSCTL_NODE(_vfs_fusefs, OID_AUTO, stats, CTLFLAG_RW, 0, "FUSE statistics"); SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_major, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FUSE_KERNEL_VERSION, "FUSE kernel abi major version"); SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_minor, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FUSE_KERNEL_MINOR_VERSION, "FUSE kernel abi minor version"); -SDT_PROVIDER_DEFINE(fuse); +SDT_PROVIDER_DEFINE(fusefs); /****************************** * @@ -111,7 +123,9 @@ static void fuse_bringdown(eventhandler_tag eh_tag) { - + fuse_node_destroy(); + fuse_internal_destroy(); + fuse_file_destroy(); fuse_ipc_destroy(); fuse_device_destroy(); mtx_destroy(&fuse_mtx); @@ -132,16 +146,14 @@ return (err); } fuse_ipc_init(); + fuse_file_init(); + fuse_internal_init(); + fuse_node_init(); fuse_pbuf_zone = pbuf_zsecond_create("fusepbuf", nswbuf / 2); /* vfs_modevent ignores its first arg */ if ((err = vfs_modevent(NULL, what, &fuse_vfsconf))) fuse_bringdown(eh_tag); - else - printf("fuse-freebsd: version %s, FUSE ABI %d.%d\n", - FUSE_FREEBSD_VERSION, - FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); - break; case MOD_UNLOAD: if ((err = vfs_modevent(NULL, what, &fuse_vfsconf))) Index: sys/fs/fuse/fuse_node.h =================================================================== --- sys/fs/fuse/fuse_node.h +++ sys/fs/fuse/fuse_node.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -60,60 +65,121 @@ #ifndef _FUSE_NODE_H_ #define _FUSE_NODE_H_ +#include #include #include #include "fuse_file.h" -#define FN_REVOKED 0x00000020 -#define FN_FLUSHINPROG 0x00000040 -#define FN_FLUSHWANT 0x00000080 -#define FN_SIZECHANGE 0x00000100 -#define FN_DIRECTIO 0x00000200 +#define FN_REVOKED 0x00000020 +#define FN_FLUSHINPROG 0x00000040 +#define FN_FLUSHWANT 0x00000080 +/* + * Indicates that the file's size is dirty; the kernel has changed it but not + * yet send the change to the daemon. When this bit is set, the + * cache_attrs.va_size field does not time out. + */ +#define FN_SIZECHANGE 0x00000100 +#define FN_DIRECTIO 0x00000200 +/* Indicates that parent_nid is valid */ +#define FN_PARENT_NID 0x00000400 +/* + * Indicates that the file's cached timestamps are dirty. They will be flushed + * during the next SETATTR or WRITE. Until then, the cached fields will not + * time out. 
+ */ +#define FN_MTIMECHANGE 0x00000800 +#define FN_CTIMECHANGE 0x00001000 + struct fuse_vnode_data { /** self **/ uint64_t nid; + uint64_t generation; /** parent **/ - /* XXXIP very likely to be stale, it's not updated in rename() */ uint64_t parent_nid; /** I/O **/ - struct fuse_filehandle fufh[FUFH_MAXTYPE]; + /* List of file handles for all of the vnode's open file descriptors */ + LIST_HEAD(, fuse_filehandle) handles; /** flags **/ uint32_t flag; /** meta **/ - bool valid_attr_cache; + /* The monotonic time after which the attr cache is invalid */ + struct bintime attr_cache_timeout; + /* + * Monotonic time after which the entry is invalid. Used for lookups + * by nodeid instead of pathname. + */ + struct bintime entry_cache_timeout; struct vattr cached_attrs; - off_t filesize; uint64_t nlookup; enum vtype vtype; }; +/* + * This overlays the fid structure (see mount.h). Mostly the same as the types + * used by UFS and ext2. + */ +struct fuse_fid { + uint16_t len; /* Length of structure. */ + uint16_t pad; /* Force 32-bit alignment. */ + uint32_t gen; /* Generation number. */ + uint64_t nid; /* FUSE node id. */ +}; + #define VTOFUD(vp) \ ((struct fuse_vnode_data *)((vp)->v_data)) #define VTOI(vp) (VTOFUD(vp)->nid) -#define VTOVA(vp) \ - (VTOFUD(vp)->valid_attr_cache ? \ - &(VTOFUD(vp)->cached_attrs) : NULL) +static inline struct vattr* +VTOVA(struct vnode *vp) +{ + struct bintime now; + + getbinuptime(&now); + if (bintime_cmp(&(VTOFUD(vp)->attr_cache_timeout), &now, >)) + return &(VTOFUD(vp)->cached_attrs); + else + return NULL; +} + +static inline void +fuse_vnode_clear_attr_cache(struct vnode *vp) +{ + bintime_clear(&VTOFUD(vp)->attr_cache_timeout); +} + +static uint32_t inline +fuse_vnode_hash(uint64_t id) +{ + return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT)); +} + #define VTOILLU(vp) ((uint64_t)(VTOFUD(vp) ? VTOI(vp) : 0)) #define FUSE_NULL_ID 0 +extern struct vop_vector fuse_fifoops; extern struct vop_vector fuse_vnops; +int fuse_vnode_cmp(struct vnode *vp, void *nidp); + static inline void fuse_vnode_setparent(struct vnode *vp, struct vnode *dvp) { if (dvp != NULL && vp->v_type == VDIR) { MPASS(dvp->v_type == VDIR); VTOFUD(vp)->parent_nid = VTOI(dvp); + VTOFUD(vp)->flag |= FN_PARENT_NID; } } +int fuse_vnode_size(struct vnode *vp, off_t *filesize, struct ucred *cred, + struct thread *td); + void fuse_vnode_destroy(struct vnode *vp); int fuse_vnode_get(struct mount *mp, struct fuse_entry_out *feo, @@ -123,10 +189,14 @@ void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td); -void fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred); +int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred, pid_t pid); -int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred); - int fuse_vnode_setsize(struct vnode *vp, off_t newsize); +void fuse_vnode_undirty_cached_timestamps(struct vnode *vp); + +void fuse_vnode_update(struct vnode *vp, int flags); + +void fuse_node_init(void); +void fuse_node_destroy(void); #endif /* _FUSE_NODE_H_ */ Index: sys/fs/fuse/fuse_node.c =================================================================== --- sys/fs/fuse/fuse_node.c +++ sys/fs/fuse/fuse_node.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. 
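The attr_cache_timeout field and the new VTOVA() above implement attribute caching by deadline: the relative validity period returned by the daemon (attr_valid seconds plus attr_valid_nsec) is converted to an absolute monotonic time when the reply is cached, and later lookups only compare the current uptime against it. The userspace sketch below shows the same policy with clock_gettime(CLOCK_MONOTONIC) standing in for the kernel's getbinuptime()/bintime; the names cache_arm and cache_valid are hypothetical and the overflow handling is simplified.

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

struct attr_cache {
        struct timespec deadline;       /* analogous to attr_cache_timeout */
};

/* Arm the cache when a FUSE_GETATTR or FUSE_LOOKUP reply arrives. */
void
cache_arm(struct attr_cache *c, uint64_t attr_valid, uint32_t attr_valid_nsec)
{
        clock_gettime(CLOCK_MONOTONIC, &c->deadline);
        c->deadline.tv_sec += attr_valid;
        c->deadline.tv_nsec += attr_valid_nsec;
        if (c->deadline.tv_nsec >= 1000000000L) {
                c->deadline.tv_nsec -= 1000000000L;
                c->deadline.tv_sec++;
        }
}

/* VTOVA-style check: cached attributes are usable until the deadline. */
bool
cache_valid(const struct attr_cache *c)
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        if (now.tv_sec != c->deadline.tv_sec)
                return (now.tv_sec < c->deadline.tv_sec);
        return (now.tv_nsec < c->deadline.tv_nsec);
}

A getattr-style caller would test cache_valid() first and fall back to a fresh round trip only when it returns false, which mirrors how fuse_vnode_size() later in this change resorts to fuse_internal_do_getattr() only when the cached attributes are stale.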
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -59,6 +64,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include #include @@ -77,8 +83,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -89,65 +95,40 @@ #include "fuse_io.h" #include "fuse_ipc.h" -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , node, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , node, trace, "int", "char*"); MALLOC_DEFINE(M_FUSEVN, "fuse_vnode", "fuse vnode private data"); static int sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS); -static int fuse_node_count = 0; +static counter_u64_t fuse_node_count; -SYSCTL_INT(_vfs_fusefs, OID_AUTO, node_count, CTLFLAG_RD, - &fuse_node_count, 0, "Count of FUSE vnodes"); +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, node_count, CTLFLAG_RD, + &fuse_node_count, "Count of FUSE vnodes"); int fuse_data_cache_mode = FUSE_CACHE_WT; +/* + * DEPRECATED + * This sysctl is no longer needed as of fuse protocol 7.23. Individual + * servers can select the cache behavior they need for each mountpoint: + * - writethrough: the default + * - writeback: set FUSE_WRITEBACK_CACHE in fuse_init_out.flags + * - uncached: set FOPEN_DIRECT_IO for every file + * The sysctl is retained primarily for use by jails supporting older FUSE + * protocols. It may be removed entirely once FreeBSD 11.3 and 12.0 are EOL. + */ SYSCTL_PROC(_vfs_fusefs, OID_AUTO, data_cache_mode, CTLTYPE_INT|CTLFLAG_RW, &fuse_data_cache_mode, 0, sysctl_fuse_cache_mode, "I", "Zero: disable caching of FUSE file data; One: write-through caching " "(default); Two: write-back caching (generally unsafe)"); -int fuse_data_cache_invalidate = 0; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, data_cache_invalidate, CTLFLAG_RW, - &fuse_data_cache_invalidate, 0, - "If non-zero, discard cached clean file data when there are no active file" - " users"); - -int fuse_mmap_enable = 1; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, mmap_enable, CTLFLAG_RW, - &fuse_mmap_enable, 0, - "If non-zero, and data_cache_mode is also non-zero, enable mmap(2) of " - "FUSE files"); - -int fuse_refresh_size = 0; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, refresh_size, CTLFLAG_RW, - &fuse_refresh_size, 0, - "If non-zero, and no dirty file extension data is buffered, fetch file " - "size before write operations"); - -int fuse_sync_resize = 1; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, sync_resize, CTLFLAG_RW, - &fuse_sync_resize, 0, - "If a cached write extended a file, inform FUSE filesystem of the changed" - "size immediately subsequent to the issued writes"); - -int fuse_fix_broken_io = 0; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, fix_broken_io, CTLFLAG_RW, - &fuse_fix_broken_io, 0, - "If non-zero, print a diagnostic warning if a userspace filesystem returns" - " EIO on reads of recently extended portions of files"); - static int sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS) { @@ -174,9 +155,8 @@ fuse_vnode_init(struct vnode *vp, struct fuse_vnode_data *fvdat, uint64_t nodeid, enum vtype vtyp) { - int i; - fvdat->nid = nodeid; + LIST_INIT(&fvdat->handles); vattr_null(&fvdat->cached_attrs); if (nodeid == FUSE_ROOT_ID) { vp->v_vflag |= VV_ROOT; @@ -184,10 +164,7 @@ vp->v_type = vtyp; vp->v_data = fvdat; - for (i = 0; i < FUFH_MAXTYPE; i++) - fvdat->fufh[i].fh_type = FUFH_INVALID; - - 
atomic_add_acq_int(&fuse_node_count, 1); + counter_u64_add(fuse_node_count, 1); } void @@ -196,23 +173,21 @@ struct fuse_vnode_data *fvdat = vp->v_data; vp->v_data = NULL; + KASSERT(LIST_EMPTY(&fvdat->handles), + ("Destroying fuse vnode with open files!")); free(fvdat, M_FUSEVN); - atomic_subtract_acq_int(&fuse_node_count, 1); + counter_u64_add(fuse_node_count, -1); } -static int +int fuse_vnode_cmp(struct vnode *vp, void *nidp) { return (VTOI(vp) != *((uint64_t *)nidp)); } -static uint32_t inline -fuse_vnode_hash(uint64_t id) -{ - return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT)); -} - +SDT_PROBE_DEFINE3(fusefs, , node, stale_vnode, "struct vnode*", "enum vtype", + "uint64_t"); static int fuse_vnode_alloc(struct mount *mp, struct thread *td, @@ -220,10 +195,12 @@ enum vtype vtyp, struct vnode **vpp) { + struct fuse_data *data; struct fuse_vnode_data *fvdat; struct vnode *vp2; int err = 0; + data = fuse_get_mpdata(mp); if (vtyp == VNON) { return EINVAL; } @@ -234,12 +211,34 @@ return (err); if (*vpp) { - MPASS((*vpp)->v_type == vtyp && (*vpp)->v_data != NULL); - SDT_PROBE2(fuse, , node, trace, 1, "vnode taken from hash"); + if ((*vpp)->v_type != vtyp) { + /* + * STALE vnode! This probably indicates a buggy + * server, but it could also be the result of a race + * between FUSE_LOOKUP and another client's + * FUSE_UNLINK/FUSE_CREATE + */ + SDT_PROBE3(fusefs, , node, stale_vnode, *vpp, vtyp, + nodeid); + fuse_internal_vnode_disappear(*vpp); + lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL); + *vpp = NULL; + return (EAGAIN); + } + MPASS((*vpp)->v_data != NULL); + MPASS(VTOFUD(*vpp)->nid == nodeid); + SDT_PROBE2(fusefs, , node, trace, 1, "vnode taken from hash"); return (0); } fvdat = malloc(sizeof(*fvdat), M_FUSEVN, M_WAITOK | M_ZERO); - err = getnewvnode("fuse", mp, &fuse_vnops, vpp); + switch (vtyp) { + case VFIFO: + err = getnewvnode("fuse", mp, &fuse_fifoops, vpp); + break; + default: + err = getnewvnode("fuse", mp, &fuse_vnops, vpp); + break; + } if (err) { free(fvdat, M_FUSEVN); return (err); @@ -249,14 +248,23 @@ err = insmntque(*vpp, mp); ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc"); if (err) { + lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL); free(fvdat, M_FUSEVN); *vpp = NULL; return (err); } + /* Disallow async reads for fifos because UFS does. I don't know why */ + if (data->dataflags & FSESS_ASYNC_READ && vtyp != VFIFO) + VN_LOCK_ASHARE(*vpp); + err = vfs_hash_insert(*vpp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE, td, &vp2, fuse_vnode_cmp, &nodeid); - if (err) + if (err) { + lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL); + free(fvdat, M_FUSEVN); + *vpp = NULL; return (err); + } if (vp2 != NULL) { *vpp = vp2; return (0); @@ -277,6 +285,11 @@ enum vtype vtyp) { struct thread *td = (cnp != NULL ? cnp->cn_thread : curthread); + /* + * feo should only be NULL for the root directory, which (when libfuse + * is used) always has generation 0 + */ + uint64_t generation = feo ? 
feo->generation : 0; int err = 0; err = fuse_vnode_alloc(mp, td, nodeid, vtyp, vpp); @@ -284,22 +297,28 @@ return err; } if (dvp != NULL) { - MPASS((cnp->cn_flags & ISDOTDOT) == 0); - MPASS(!(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')); + MPASS(cnp && (cnp->cn_flags & ISDOTDOT) == 0); + MPASS(cnp && + !(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')); fuse_vnode_setparent(*vpp, dvp); } if (dvp != NULL && cnp != NULL && (cnp->cn_flags & MAKEENTRY) != 0 && feo != NULL && (feo->entry_valid != 0 || feo->entry_valid_nsec != 0)) { + struct timespec timeout; + ASSERT_VOP_LOCKED(*vpp, "fuse_vnode_get"); ASSERT_VOP_LOCKED(dvp, "fuse_vnode_get"); - cache_enter(dvp, *vpp, cnp); + + fuse_validity_2_timespec(feo, &timeout); + cache_enter_time(dvp, *vpp, cnp, &timeout, NULL); } + VTOFUD(*vpp)->generation = generation; /* * In userland, libfuse uses cached lookups for dot and dotdot entries, * thus it does not really bump the nlookup counter for forget. - * Follow the same semantic and avoid tu bump it in order to keep + * Follow the same semantic and avoid the bump in order to keep * nlookup counters consistent. */ if (cnp == NULL || ((cnp->cn_flags & ISDOTDOT) == 0 && @@ -309,44 +328,19 @@ return 0; } +/* + * Called for every fusefs vnode open to initialize the vnode (not + * fuse_filehandle) for use + */ void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td) { - /* - * Funcation is called for every vnode open. - * Merge fuse_open_flags it may be 0 - */ - /* - * Ideally speaking, direct io should be enabled on - * fd's but do not see of any way of providing that - * this implementation. - * - * Also cannot think of a reason why would two - * different fd's on same vnode would like - * have DIRECT_IO turned on and off. But linux - * based implementation works on an fd not an - * inode and provides such a feature. - * - * XXXIP: Handle fd based DIRECT_IO - */ - if (fuse_open_flags & FOPEN_DIRECT_IO) { - ASSERT_VOP_ELOCKED(vp, __func__); - VTOFUD(vp)->flag |= FN_DIRECTIO; - fuse_io_invalbuf(vp, td); - } else { - if ((fuse_open_flags & FOPEN_KEEP_CACHE) == 0) - fuse_io_invalbuf(vp, td); - VTOFUD(vp)->flag &= ~FN_DIRECTIO; - } - - if (vnode_vtype(vp) == VREG) { - /* XXXIP prevent getattr, by using cached node size */ + if (vnode_vtype(vp) == VREG) vnode_create_vobject(vp, 0, td); - } } int -fuse_vnode_savesize(struct vnode *vp, struct ucred *cred) +fuse_vnode_savesize(struct vnode *vp, struct ucred *cred, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct thread *td = curthread; @@ -375,10 +369,11 @@ fsai->valid = 0; /* Truncate to a new value. */ - fsai->size = fvdat->filesize; + MPASS((fvdat->flag & FN_SIZECHANGE) != 0); + fsai->size = fvdat->cached_attrs.va_size; fsai->valid |= FATTR_SIZE; - fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); + fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); if (fufh) { fsai->fh = fufh->fh_id; fsai->valid |= FATTR_FH; @@ -391,38 +386,116 @@ return err; } -void -fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred) -{ - - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct vattr va; - - if ((fvdat->flag & FN_SIZECHANGE) != 0 || - fuse_data_cache_mode == FUSE_CACHE_UC || - (fuse_refresh_size == 0 && fvdat->filesize != 0)) - return; - - VOP_GETATTR(vp, &va, cred); - SDT_PROBE2(fuse, , node, trace, 1, "refreshed file size"); -} - +/* + * Adjust the vnode's size to a new value, such as that provided by + * FUSE_GETATTR. 
+ */ int fuse_vnode_setsize(struct vnode *vp, off_t newsize) { struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct vattr *attrs; off_t oldsize; + size_t iosize; + struct buf *bp = NULL; int err = 0; ASSERT_VOP_ELOCKED(vp, "fuse_vnode_setsize"); - oldsize = fvdat->filesize; - fvdat->filesize = newsize; - fvdat->flag |= FN_SIZECHANGE; + iosize = fuse_iosize(vp); + oldsize = fvdat->cached_attrs.va_size; + fvdat->cached_attrs.va_size = newsize; + if ((attrs = VTOVA(vp)) != NULL) + attrs->va_size = newsize; if (newsize < oldsize) { + daddr_t lbn; + err = vtruncbuf(vp, newsize, fuse_iosize(vp)); + if (err) + goto out; + if (newsize % iosize == 0) + goto out; + /* + * Zero the contents of the last partial block. + * Sure seems like vtruncbuf should do this for us. + */ + + lbn = newsize / iosize; + bp = getblk(vp, lbn, iosize, PCATCH, 0, 0); + if (!bp) { + err = EINTR; + goto out; + } + if (!(bp->b_flags & B_CACHE)) + goto out; /* Nothing to do */ + MPASS(bp->b_flags & B_VMIO); + vfs_bio_clrbuf(bp); + bp->b_dirtyend = MIN(bp->b_dirtyend, newsize - lbn * iosize); } +out: + if (bp) + brelse(bp); vnode_pager_setsize(vp, newsize); return err; +} + +/* Get the current, possibly dirty, size of the file */ +int +fuse_vnode_size(struct vnode *vp, off_t *filesize, struct ucred *cred, + struct thread *td) +{ + struct fuse_vnode_data *fvdat = VTOFUD(vp); + int error = 0; + + if (!(fvdat->flag & FN_SIZECHANGE) && + (VTOVA(vp) == NULL || fvdat->cached_attrs.va_size == VNOVAL)) + error = fuse_internal_do_getattr(vp, NULL, cred, td); + + if (!error) + *filesize = fvdat->cached_attrs.va_size; + + return error; +} + +void +fuse_vnode_undirty_cached_timestamps(struct vnode *vp) +{ + struct fuse_vnode_data *fvdat = VTOFUD(vp); + + fvdat->flag &= ~(FN_MTIMECHANGE | FN_CTIMECHANGE); +} + +/* Update a fuse file's cached timestamps */ +void +fuse_vnode_update(struct vnode *vp, int flags) +{ + struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); + struct timespec ts; + + vfs_timestamp(&ts); + + if (data->time_gran > 1) + ts.tv_nsec = rounddown(ts.tv_nsec, data->time_gran); + + if (flags & FN_MTIMECHANGE) + fvdat->cached_attrs.va_mtime = ts; + if (flags & FN_CTIMECHANGE) + fvdat->cached_attrs.va_ctime = ts; + + fvdat->flag |= flags; +} + +void +fuse_node_init(void) +{ + fuse_node_count = counter_u64_alloc(M_WAITOK); + counter_u64_zero(fuse_node_count); +} + +void +fuse_node_destroy(void) +{ + counter_u64_free(fuse_node_count); } Index: sys/fs/fuse/fuse_param.h =================================================================== --- sys/fs/fuse/fuse_param.h +++ /dev/null @@ -1,82 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-3-Clause - * - * Copyright (c) 2007-2009 Google Inc. and Amit Singh - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Google Inc. nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _FUSE_PARAM_H_ -#define _FUSE_PARAM_H_ - -/* - * This is the prefix ("fuse" by default) of the name of a FUSE device node - * in devfs. The suffix is the device number. "/dev/fuse0" is the first FUSE - * device by default. If you change the prefix from the default to something - * else, the user-space FUSE library will need to know about it too. - */ -#define FUSE_DEVICE_BASENAME "fuse" - -/* - * This is the number of /dev/fuse nodes we will create. goes from - * 0 to (FUSE_NDEVICES - 1). - */ -#define FUSE_NDEVICES 16 - -/* - * This is the default block size of the virtual storage devices that are - * implicitly implemented by the FUSE kernel extension. This can be changed - * on a per-mount basis (there's one such virtual device for each mount). - */ -#define FUSE_DEFAULT_BLOCKSIZE 4096 - -/* - * This is default I/O size used while accessing the virtual storage devices. - * This can be changed on a per-mount basis. - */ -#define FUSE_DEFAULT_IOSIZE 4096 - -#ifdef KERNEL - -/* - * This is the soft upper limit on the number of "request tickets" FUSE's - * user-kernel IPC layer can have for a given mount. This can be modified - * through the fuse.* sysctl interface. - */ -#define FUSE_DEFAULT_MAX_FREE_TICKETS 1024 - -#define FUSE_DEFAULT_IOV_PERMANENT_BUFSIZE (1L << 19) -#define FUSE_DEFAULT_IOV_CREDIT 16 - -#endif - -#define FUSE_LINK_MAX UINT32_MAX - -#endif /* _FUSE_PARAM_H_ */ Index: sys/fs/fuse/fuse_vfsops.c =================================================================== --- sys/fs/fuse/fuse_vfsops.c +++ sys/fs/fuse/fuse_vfsops.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -81,7 +86,6 @@ #include #include "fuse.h" -#include "fuse_param.h" #include "fuse_node.h" #include "fuse_ipc.h" #include "fuse_internal.h" @@ -89,13 +93,13 @@ #include #include -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , vfsops, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , vfsops, trace, "int", "char*"); /* This will do for privilege types for now */ #ifndef PRIV_VFS_FUSE_ALLOWOTHER @@ -108,30 +112,28 @@ #define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER #endif +static vfs_fhtovp_t fuse_vfsop_fhtovp; static vfs_mount_t fuse_vfsop_mount; static vfs_unmount_t fuse_vfsop_unmount; static vfs_root_t fuse_vfsop_root; static vfs_statfs_t fuse_vfsop_statfs; +static vfs_vget_t fuse_vfsop_vget; struct vfsops fuse_vfsops = { + .vfs_fhtovp = fuse_vfsop_fhtovp, .vfs_mount = fuse_vfsop_mount, .vfs_unmount = fuse_vfsop_unmount, .vfs_root = fuse_vfsop_root, .vfs_statfs = fuse_vfsop_statfs, + .vfs_vget = fuse_vfsop_vget, }; -SYSCTL_INT(_vfs_fusefs, OID_AUTO, init_backgrounded, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, 1, "indicate async handshake"); static int fuse_enforce_dev_perms = 0; SYSCTL_INT(_vfs_fusefs, OID_AUTO, enforce_dev_perms, CTLFLAG_RW, &fuse_enforce_dev_perms, 0, "enforce fuse device permissions for secondary mounts"); -static unsigned sync_unmount = 1; -SYSCTL_UINT(_vfs_fusefs, OID_AUTO, sync_unmount, CTLFLAG_RW, - &sync_unmount, 0, "specify when to use synchronous unmount"); - MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer"); static int @@ -208,11 +210,90 @@ vfs_flagopt(opts, "__" #fnam, &__mntopts, fval); \ } while (0) -SDT_PROBE_DEFINE1(fuse, , vfsops, mntopts, "uint64_t"); -SDT_PROBE_DEFINE4(fuse, , vfsops, mount_err, "char*", "struct fuse_data*", +SDT_PROBE_DEFINE1(fusefs, , vfsops, mntopts, "uint64_t"); +SDT_PROBE_DEFINE4(fusefs, , vfsops, mount_err, "char*", "struct fuse_data*", "struct mount*", "int"); static int +fuse_vfs_remount(struct mount *mp, struct thread *td, uint64_t mntopts, + uint32_t max_read, int daemon_timeout) +{ + int err = 0; + struct fuse_data *data = fuse_get_mpdata(mp); + /* Don't allow these options to be changed */ + const static unsigned long long cant_update_opts = + MNT_USER; /* Mount owner must be the user running the daemon */ + + FUSE_LOCK(); + + if ((mp->mnt_flag ^ data->mnt_flag) & cant_update_opts) { + err = EOPNOTSUPP; + SDT_PROBE4(fusefs, , vfsops, mount_err, + "Can't change these mount options during remount", + data, mp, err); + goto out; + } + if (((data->dataflags ^ mntopts) & FSESS_MNTOPTS_MASK) || + (data->max_read != max_read) || + (data->daemon_timeout != daemon_timeout)) { + // TODO: allow changing options where it makes sense + err = EOPNOTSUPP; + SDT_PROBE4(fusefs, , vfsops, mount_err, + "Can't change fuse mount options during remount", + data, mp, err); + goto out; + } + + if (fdata_get_dead(data)) { + err = ENOTCONN; + SDT_PROBE4(fusefs, , vfsops, mount_err, + "device is dead during mount", data, mp, err); + goto out; + } + + /* Sanity + permission checks */ + if (!data->daemoncred) + panic("fuse daemon found, but identity unknown"); + if (mntopts & FSESS_DAEMON_CAN_SPY) + err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER); + if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid) + /* are we allowed to do the first mount? 
*/ + err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER); + +out: + FUSE_UNLOCK(); + return err; +} + +static int +fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags, + struct vnode **vpp) +{ + struct fuse_fid *ffhp = (struct fuse_fid *)fhp; + struct fuse_vnode_data *fvdat; + struct vnode *nvp; + int error; + + if (!(fuse_get_mpdata(mp)->dataflags & FSESS_EXPORT_SUPPORT)) + return EOPNOTSUPP; + + error = VFS_VGET(mp, ffhp->nid, LK_EXCLUSIVE, &nvp); + if (error) { + *vpp = NULLVP; + return (error); + } + fvdat = VTOFUD(nvp); + if (fvdat->generation != ffhp->gen ) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + vnode_create_vobject(*vpp, 0, curthread); + return (0); +} + +static int fuse_vfsop_mount(struct mount *mp) { int err; @@ -238,13 +319,6 @@ __mntopts = 0; td = curthread; - if (mp->mnt_flag & MNT_UPDATE) - return EOPNOTSUPP; - - MNT_ILOCK(mp); - mp->mnt_flag |= MNT_SYNCHRONOUS; - mp->mnt_data = NULL; - MNT_IUNLOCK(mp); /* Get the new options passed to mount */ opts = mp->mnt_optnew; @@ -255,19 +329,6 @@ if (!vfs_getopts(opts, "fspath", &err)) return err; - /* `from' contains the device name (eg. /dev/fuse0); REQUIRED */ - fspec = vfs_getopts(opts, "from", &err); - if (!fspec) - return err; - - /* `fd' contains the filedescriptor for this session; REQUIRED */ - if (vfs_scanopt(opts, "fd", "%d", &fd) != 1) - return EINVAL; - - err = fuse_getdevice(fspec, td, &fdev); - if (err != 0) - return err; - /* * With the help of underscored options the mount program * can inform us from the flags it sets by default @@ -275,12 +336,6 @@ FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY); FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN); FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS); - FUSE_FLAGOPT(no_attrcache, FSESS_NO_ATTRCACHE); - FUSE_FLAGOPT(no_readahed, FSESS_NO_READAHEAD); - FUSE_FLAGOPT(no_datacache, FSESS_NO_DATACACHE); - FUSE_FLAGOPT(no_namecache, FSESS_NO_NAMECACHE); - FUSE_FLAGOPT(no_mmap, FSESS_NO_MMAP); - FUSE_FLAGOPT(brokenio, FSESS_BROKENIO); (void)vfs_scanopt(opts, "max_read=", "%u", &max_read); if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) { @@ -293,11 +348,29 @@ } subtype = vfs_getopts(opts, "subtype=", &err); - SDT_PROBE1(fuse, , vfsops, mntopts, mntopts); + SDT_PROBE1(fusefs, , vfsops, mntopts, mntopts); + if (mp->mnt_flag & MNT_UPDATE) { + return fuse_vfs_remount(mp, td, mntopts, max_read, + daemon_timeout); + } + + /* `from' contains the device name (eg. 
/dev/fuse0); REQUIRED */ + fspec = vfs_getopts(opts, "from", &err); + if (!fspec) + return err; + + /* `fd' contains the filedescriptor for this session; REQUIRED */ + if (vfs_scanopt(opts, "fd", "%d", &fd) != 1) + return EINVAL; + + err = fuse_getdevice(fspec, td, &fdev); + if (err != 0) + return err; + err = fget(td, fd, &cap_read_rights, &fp); if (err != 0) { - SDT_PROBE2(fuse, , vfsops, trace, 1, + SDT_PROBE2(fusefs, , vfsops, trace, 1, "invalid or not opened device"); goto out; } @@ -307,16 +380,17 @@ td->td_fpop = fptmp; fdrop(fp, td); FUSE_LOCK(); - if (err != 0 || data == NULL || data->mp != NULL) { + + if (err != 0 || data == NULL) { err = ENXIO; - SDT_PROBE4(fuse, , vfsops, mount_err, + SDT_PROBE4(fusefs, , vfsops, mount_err, "invalid or not opened device", data, mp, err); FUSE_UNLOCK(); goto out; } if (fdata_get_dead(data)) { err = ENOTCONN; - SDT_PROBE4(fuse, , vfsops, mount_err, + SDT_PROBE4(fusefs, , vfsops, mount_err, "device is dead during mount", data, mp, err); FUSE_UNLOCK(); goto out; @@ -338,12 +412,17 @@ data->dataflags |= mntopts; data->max_read = max_read; data->daemon_timeout = daemon_timeout; + data->mnt_flag = mp->mnt_flag & MNT_UPDATEMASK; FUSE_UNLOCK(); vfs_getnewfsid(mp); MNT_ILOCK(mp); mp->mnt_data = data; - mp->mnt_flag |= MNT_LOCAL; + /* + * FUSE file systems can be either local or remote, but the kernel + * can't tell the difference. + */ + mp->mnt_flag &= ~MNT_LOCAL; mp->mnt_kern_flag |= MNTK_USES_BCACHE; MNT_IUNLOCK(mp); /* We need this here as this slot is used by getnewvnode() */ @@ -354,6 +433,7 @@ } copystr(fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &len); bzero(mp->mnt_stat.f_mntfromname + len, MNAMELEN - len); + mp->mnt_iosize_max = MAXPHYS; /* Now handshaking with daemon */ fuse_internal_send_init(data, td); @@ -366,9 +446,10 @@ * Destroy device only if we acquired reference to * it */ - SDT_PROBE4(fuse, , vfsops, mount_err, + SDT_PROBE4(fusefs, , vfsops, mount_err, "mount failed, destroy device", data, mp, err); data->mp = NULL; + mp->mnt_data = NULL; fdata_trydestroy(data); } FUSE_UNLOCK(); @@ -412,11 +493,13 @@ if (fdata_get_dead(data)) { goto alreadydead; } - fdisp_init(&fdi, 0); - fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL); + if (fsess_isimpl(mp, FUSE_DESTROY)) { + fdisp_init(&fdi, 0); + fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL); - err = fdisp_wait_answ(&fdi); - fdisp_destroy(&fdi); + (void)fdisp_wait_answ(&fdi); + fdisp_destroy(&fdi); + } fdata_set_dead(data); @@ -429,7 +512,6 @@ MNT_ILOCK(mp); mp->mnt_data = NULL; - mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); dev_rel(fdev); @@ -437,7 +519,87 @@ return 0; } +SDT_PROBE_DEFINE1(fusefs, , vfsops, invalidate_without_export, + "struct mount*"); static int +fuse_vfsop_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) +{ + struct fuse_data *data = fuse_get_mpdata(mp); + uint64_t nodeid = ino; + struct thread *td = curthread; + struct fuse_dispatcher fdi; + struct fuse_entry_out *feo; + struct fuse_vnode_data *fvdat; + const char dot[] = "."; + off_t filesize; + enum vtype vtyp; + int error; + + if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) { + /* + * Unreachable unless you do something stupid, like export a + * nullfs mount of a fusefs file system. + */ + SDT_PROBE1(fusefs, , vfsops, invalidate_without_export, mp); + return (EOPNOTSUPP); + } + + error = fuse_internal_get_cached_vnode(mp, ino, flags, vpp); + if (error || *vpp != NULL) + return error; + + /* Do a LOOKUP, using nodeid as the parent and "." 
as filename */ + fdisp_init(&fdi, sizeof(dot)); + fdisp_make(&fdi, FUSE_LOOKUP, mp, nodeid, td, td->td_ucred); + memcpy(fdi.indata, dot, sizeof(dot)); + error = fdisp_wait_answ(&fdi); + + if (error) + return error; + + feo = (struct fuse_entry_out *)fdi.answ; + if (feo->nodeid == 0) { + /* zero nodeid means ENOENT and cache it */ + error = ENOENT; + goto out; + } + + vtyp = IFTOVT(feo->attr.mode); + error = fuse_vnode_get(mp, feo, nodeid, NULL, vpp, NULL, vtyp); + if (error) + goto out; + filesize = feo->attr.size; + + /* + * In the case where we are looking up a FUSE node represented by an + * existing cached vnode, and the true size reported by FUSE_LOOKUP + * doesn't match the vnode's cached size, then any cached writes beyond + * the file's current size are lost. + * + * We can get here: + * * following attribute cache expiration, or + * * due a bug in the daemon, or + */ + fvdat = VTOFUD(*vpp); + if (vnode_isreg(*vpp) && + filesize != fvdat->cached_attrs.va_size && + fvdat->flag & FN_SIZECHANGE) { + printf("%s: WB cache incoherent on %s!\n", __func__, + vnode_mount(*vpp)->mnt_stat.f_mntonname); + + fvdat->flag &= ~FN_SIZECHANGE; + } + + fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, + feo->attr_valid_nsec, NULL); + fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec, + &fvdat->entry_cache_timeout); +out: + fdisp_destroy(&fdi); + return error; +} + +static int fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp) { struct fuse_data *data = fuse_get_mpdata(mp); @@ -454,13 +616,13 @@ FUSE_LOCK(); MPASS(data->vroot == NULL || data->vroot == *vpp); if (data->vroot == NULL) { - SDT_PROBE2(fuse, , vfsops, trace, 1, + SDT_PROBE2(fusefs, , vfsops, trace, 1, "new root vnode"); data->vroot = *vpp; FUSE_UNLOCK(); vref(*vpp); } else if (data->vroot != *vpp) { - SDT_PROBE2(fuse, , vfsops, trace, 1, + SDT_PROBE2(fusefs, , vfsops, trace, 1, "root vnode race"); FUSE_UNLOCK(); VOP_UNLOCK(*vpp, 0); @@ -523,7 +685,7 @@ sbp->f_files = 0; sbp->f_ffree = 0; sbp->f_namemax = 0; - sbp->f_bsize = FUSE_DEFAULT_BLOCKSIZE; + sbp->f_bsize = S_BLKSIZE; return 0; } Index: sys/fs/fuse/fuse_vnops.c =================================================================== --- sys/fs/fuse/fuse_vnops.c +++ sys/fs/fuse/fuse_vnops.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -102,24 +107,30 @@ #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" -#include "fuse_param.h" #include "fuse_io.h" #include -SDT_PROVIDER_DECLARE(fuse); +/* Maximum number of hardlinks to a single FUSE file */ +#define FUSE_LINK_MAX UINT32_MAX + +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , vnops, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*"); /* vnode ops */ static vop_access_t fuse_vnop_access; +static vop_advlock_t fuse_vnop_advlock; +static vop_bmap_t fuse_vnop_bmap; +static vop_close_t fuse_fifo_close; static vop_close_t fuse_vnop_close; static vop_create_t fuse_vnop_create; static vop_deleteextattr_t fuse_vnop_deleteextattr; +static vop_fdatasync_t fuse_vnop_fdatasync; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_getextattr_t fuse_vnop_getextattr; @@ -144,19 +155,44 @@ static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; -static vop_putpages_t fuse_vnop_putpages; static vop_print_t fuse_vnop_print; +static vop_vptofh_t fuse_vnop_vptofh; +struct vop_vector fuse_fifoops = { + .vop_default = &fifo_specops, + .vop_access = fuse_vnop_access, + .vop_close = fuse_fifo_close, + .vop_fsync = fuse_vnop_fsync, + .vop_getattr = fuse_vnop_getattr, + .vop_inactive = fuse_vnop_inactive, + .vop_pathconf = fuse_vnop_pathconf, + .vop_print = fuse_vnop_print, + .vop_read = VOP_PANIC, + .vop_reclaim = fuse_vnop_reclaim, + .vop_setattr = fuse_vnop_setattr, + .vop_write = VOP_PANIC, + .vop_vptofh = fuse_vnop_vptofh, +}; + struct vop_vector fuse_vnops = { + .vop_allocate = VOP_EINVAL, .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, + .vop_advlock = fuse_vnop_advlock, + .vop_bmap = fuse_vnop_bmap, .vop_close = fuse_vnop_close, .vop_create = fuse_vnop_create, .vop_deleteextattr = fuse_vnop_deleteextattr, .vop_fsync = fuse_vnop_fsync, + .vop_fdatasync = fuse_vnop_fdatasync, .vop_getattr = fuse_vnop_getattr, .vop_getextattr = fuse_vnop_getextattr, .vop_inactive = fuse_vnop_inactive, + /* + * TODO: implement vop_ioctl after upgrading to protocol 7.16. + * FUSE_IOCTL was added in 7.11, but 32-bit compat is broken until + * 7.16. + */ .vop_link = fuse_vnop_link, .vop_listextattr = fuse_vnop_listextattr, .vop_lookup = fuse_vnop_lookup, @@ -164,6 +200,12 @@ .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = fuse_vnop_pathconf, + /* + * TODO: implement vop_poll after upgrading to protocol 7.21. 
+ * FUSE_POLL was added in protocol 7.11, but it's kind of broken until + * 7.21, which adds the ability for the client to choose which poll + * events it wants, and for a client to deregister a file handle + */ .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, @@ -177,41 +219,103 @@ .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, - .vop_putpages = fuse_vnop_putpages, .vop_print = fuse_vnop_print, + .vop_vptofh = fuse_vnop_vptofh, }; -static u_long fuse_lookup_cache_hits = 0; +uma_zone_t fuse_pbuf_zone; -SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_hits, CTLFLAG_RD, - &fuse_lookup_cache_hits, 0, "number of positive cache hits in lookup"); +#define fuse_vm_page_lock(m) vm_page_lock((m)); +#define fuse_vm_page_unlock(m) vm_page_unlock((m)); +#define fuse_vm_page_lock_queues() ((void)0) +#define fuse_vm_page_unlock_queues() ((void)0) -static u_long fuse_lookup_cache_misses = 0; +/* Check permission for extattr operations, much like extattr_check_cred */ +static int +fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred, + struct thread *td, accmode_t accmode) +{ + struct mount *mp = vnode_mount(vp); + struct fuse_data *data = fuse_get_mpdata(mp); -SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_misses, CTLFLAG_RD, - &fuse_lookup_cache_misses, 0, "number of cache misses in lookup"); + /* + * Kernel-invoked always succeeds. + */ + if (cred == NOCRED) + return (0); -int fuse_lookup_cache_enable = 1; + /* + * Do not allow privileged processes in jail to directly manipulate + * system attributes. + */ + switch (ns) { + case EXTATTR_NAMESPACE_SYSTEM: + if (data->dataflags & FSESS_DEFAULT_PERMISSIONS) { + return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); + } + /* FALLTHROUGH */ + case EXTATTR_NAMESPACE_USER: + return (fuse_internal_access(vp, accmode, td, cred)); + default: + return (EPERM); + } +} -SYSCTL_INT(_vfs_fusefs, OID_AUTO, lookup_cache_enable, CTLFLAG_RW, - &fuse_lookup_cache_enable, 0, "if non-zero, enable lookup cache"); +/* Get a filehandle for a directory */ +static int +fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp, + struct ucred *cred, pid_t pid) +{ + if (fuse_filehandle_get(vp, FREAD, fufhp, cred, pid) == 0) + return 0; + return fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid); +} -/* - * XXX: This feature is highly experimental and can bring to instabilities, - * needs revisiting before to be enabled by default. - */ -static int fuse_reclaim_revoked = 0; +/* Send FUSE_FLUSH for this vnode */ +static int +fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) +{ + struct fuse_flush_in *ffi; + struct fuse_filehandle *fufh; + struct fuse_dispatcher fdi; + struct thread *td = curthread; + struct mount *mp = vnode_mount(vp); + int err; -SYSCTL_INT(_vfs_fusefs, OID_AUTO, reclaim_revoked, CTLFLAG_RW, - &fuse_reclaim_revoked, 0, ""); + if (!fsess_isimpl(vnode_mount(vp), FUSE_FLUSH)) + return 0; -uma_zone_t fuse_pbuf_zone; + err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); + if (err) + return err; -#define fuse_vm_page_lock(m) vm_page_lock((m)); -#define fuse_vm_page_unlock(m) vm_page_unlock((m)); -#define fuse_vm_page_lock_queues() ((void)0) -#define fuse_vm_page_unlock_queues() ((void)0) + fdisp_init(&fdi, sizeof(*ffi)); + fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred); + ffi = fdi.indata; + ffi->fh = fufh->fh_id; + /* + * If the file has a POSIX lock then we're supposed to set lock_owner. 
+ * If not, then lock_owner is undefined. So we may as well always set + * it. + */ + ffi->lock_owner = td->td_proc->p_pid; + err = fdisp_wait_answ(&fdi); + if (err == ENOSYS) { + fsess_set_notimpl(mp, FUSE_FLUSH); + err = 0; + } + fdisp_destroy(&fdi); + return err; +} + +/* Close wrapper for fifos. */ +static int +fuse_fifo_close(struct vop_close_args *ap) +{ + return (fifo_specops.vop_close(ap)); +} + /* struct vnop_access_args { struct vnode *a_vp; @@ -231,7 +335,6 @@ int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; - struct fuse_access_param facp; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; @@ -254,15 +357,192 @@ if (vnode_islnk(vp)) { return 0; } - bzero(&facp, sizeof(facp)); - err = fuse_internal_access(vp, accmode, &facp, ap->a_td, ap->a_cred); + err = fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred); return err; } /* - struct vnop_close_args { + * struct vop_advlock_args { + * struct vop_generic_args a_gen; + * struct vnode *a_vp; + * void *a_id; + * int a_op; + * struct flock *a_fl; + * int a_flags; + * } + */ +static int +fuse_vnop_advlock(struct vop_advlock_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct flock *fl = ap->a_fl; + struct thread *td = curthread; + struct ucred *cred = td->td_ucred; + pid_t pid = td->td_proc->p_pid; + struct fuse_filehandle *fufh; + struct fuse_dispatcher fdi; + struct fuse_lk_in *fli; + struct fuse_lk_out *flo; + enum fuse_opcode op; + int dataflags, err; + int flags = ap->a_flags; + + dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; + + if (fuse_isdeadfs(vp)) { + return ENXIO; + } + + if (!(dataflags & FSESS_POSIX_LOCKS)) + return vop_stdadvlock(ap); + /* FUSE doesn't properly support flock until protocol 7.17 */ + if (flags & F_FLOCK) + return vop_stdadvlock(ap); + + err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid); + if (err) + return err; + + fdisp_init(&fdi, sizeof(*fli)); + + switch(ap->a_op) { + case F_GETLK: + op = FUSE_GETLK; + break; + case F_SETLK: + op = FUSE_SETLK; + break; + case F_SETLKW: + op = FUSE_SETLKW; + break; + default: + return EINVAL; + } + + fdisp_make_vp(&fdi, op, vp, td, cred); + fli = fdi.indata; + fli->fh = fufh->fh_id; + fli->owner = fl->l_pid; + fli->lk.start = fl->l_start; + if (fl->l_len != 0) + fli->lk.end = fl->l_start + fl->l_len - 1; + else + fli->lk.end = INT64_MAX; + fli->lk.type = fl->l_type; + fli->lk.pid = fl->l_pid; + + err = fdisp_wait_answ(&fdi); + fdisp_destroy(&fdi); + + if (err == 0 && op == FUSE_GETLK) { + flo = fdi.answ; + fl->l_type = flo->lk.type; + fl->l_pid = flo->lk.pid; + if (flo->lk.type != F_UNLCK) { + fl->l_start = flo->lk.start; + if (flo->lk.end == INT64_MAX) + fl->l_len = 0; + else + fl->l_len = flo->lk.end - flo->lk.start + 1; + fl->l_start = flo->lk.start; + } + } + + return err; +} + +/* { struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; +} */ +static int +fuse_vnop_bmap(struct vop_bmap_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct bufobj **bo = ap->a_bop; + struct thread *td = curthread; + struct mount *mp; + struct fuse_dispatcher fdi; + struct fuse_bmap_in *fbi; + struct fuse_bmap_out *fbo; + struct fuse_data *data; + uint64_t biosize; + off_t filesize; + daddr_t lbn = ap->a_bn; + daddr_t *pbn = ap->a_bnp; + int *runp = ap->a_runp; + int *runb = ap->a_runb; + int error = 0; + int maxrun; + + if (fuse_isdeadfs(vp)) { + return ENXIO; + } + + mp = vnode_mount(vp); + data = fuse_get_mpdata(mp); + biosize = fuse_iosize(vp); + maxrun = 
MIN(vp->v_mount->mnt_iosize_max / biosize - 1, + data->max_readahead_blocks); + + if (bo != NULL) + *bo = &vp->v_bufobj; + + /* + * The FUSE_BMAP operation does not include the runp and runb + * variables, so we must guess. Report nonzero contiguous runs so + * cluster_read will combine adjacent reads. It's worthwhile to reduce + * upcalls even if we don't know the true physical layout of the file. + * + * FUSE file systems may opt out of read clustering in two ways: + * * mounting with -onoclusterr + * * Setting max_readahead <= maxbcachebuf during FUSE_INIT + */ + if (runb != NULL) + *runb = MIN(lbn, maxrun); + if (runp != NULL) { + error = fuse_vnode_size(vp, &filesize, td->td_ucred, td); + if (error == 0) + *runp = MIN(MAX(0, filesize / biosize - lbn - 1), + maxrun); + else + *runp = 0; + } + + if (fsess_isimpl(mp, FUSE_BMAP)) { + fdisp_init(&fdi, sizeof(*fbi)); + fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred); + fbi = fdi.indata; + fbi->block = lbn; + fbi->blocksize = biosize; + error = fdisp_wait_answ(&fdi); + if (error == ENOSYS) { + fdisp_destroy(&fdi); + fsess_set_notimpl(mp, FUSE_BMAP); + error = 0; + } else { + fbo = fdi.answ; + if (error == 0 && pbn != NULL) + *pbn = fbo->block; + fdisp_destroy(&fdi); + return error; + } + } + + /* If the daemon doesn't support BMAP, make up a sensible default */ + if (pbn != NULL) + *pbn = lbn * btodb(biosize); + return (error); +} + +/* + struct vop_close_args { + struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; @@ -274,39 +554,48 @@ struct vnode *vp = ap->a_vp; struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; - fufh_type_t fufh_type; + struct thread *td = ap->a_td; + pid_t pid = td->td_proc->p_pid; + int err = 0; - if (fuse_isdeadfs(vp)) { + if (fuse_isdeadfs(vp)) return 0; - } - if (vnode_isdir(vp)) { - if (fuse_filehandle_valid(vp, FUFH_RDONLY)) { - fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); - } + if (vnode_isdir(vp)) return 0; - } - if (fflag & IO_NDELAY) { + if (fflag & IO_NDELAY) return 0; - } - fufh_type = fuse_filehandle_xlate_from_fflags(fflag); - if (!fuse_filehandle_valid(vp, fufh_type)) { - int i; - - for (i = 0; i < FUFH_MAXTYPE; i++) - if (fuse_filehandle_valid(vp, i)) - break; - if (i == FUFH_MAXTYPE) - panic("FUSE: fufh type %d found to be invalid in close" - " (fflag=0x%x)\n", - fufh_type, fflag); - } + err = fuse_flush(vp, cred, pid, fflag); + /* TODO: close the file handle, if we're sure it's no longer used */ if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { - fuse_vnode_savesize(vp, cred); + fuse_vnode_savesize(vp, cred, td->td_proc->p_pid); } - return 0; + return err; } +static void +fdisp_make_mknod_for_fallback( + struct fuse_dispatcher *fdip, + struct componentname *cnp, + struct vnode *dvp, + uint64_t parentnid, + struct thread *td, + struct ucred *cred, + mode_t mode, + enum fuse_opcode *op) +{ + struct fuse_mknod_in *fmni; + + fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1); + *op = FUSE_MKNOD; + fdisp_make(fdip, *op, vnode_mount(dvp), parentnid, td, cred); + fmni = fdip->indata; + fmni->mode = mode; + fmni->rdev = 0; + memcpy((char *)fdip->indata + sizeof(*fmni), cnp->cn_nameptr, + cnp->cn_namelen); + ((char *)fdip->indata)[sizeof(*fmni) + cnp->cn_namelen] = '\0'; +} /* struct vnop_create_args { struct vnode *a_dvp; @@ -325,107 +614,169 @@ struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; - struct fuse_open_in *foi; + struct fuse_data *data; + struct fuse_create_in *fci; struct fuse_entry_out *feo; - struct fuse_dispatcher fdi; + struct 
fuse_open_out *foo; + struct fuse_dispatcher fdi, fdi2; struct fuse_dispatcher *fdip = &fdi; + struct fuse_dispatcher *fdip2 = NULL; int err; struct mount *mp = vnode_mount(dvp); + data = fuse_get_mpdata(mp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); - uint64_t x_fh_id; - uint32_t x_open_flags; + enum fuse_opcode op; + int flags; - if (fuse_isdeadfs(dvp)) { + if (fuse_isdeadfs(dvp)) return ENXIO; - } + + /* FUSE expects sockets to be created with FUSE_MKNOD */ + if (vap->va_type == VSOCK) + return fuse_internal_mknod(dvp, vpp, cnp, vap); + + /* + * VOP_CREATE doesn't tell us the open(2) flags, so we guess. Only a + * writable mode makes sense, and we might as well include readability + * too. + */ + flags = O_RDWR; + bzero(&fdi, sizeof(fdi)); - /* XXX: Will we ever want devices ? */ - if ((vap->va_type != VREG)) { - printf("fuse_vnop_create: unsupported va_type %d\n", - vap->va_type); + if (vap->va_type != VREG) return (EINVAL); - } - fdisp_init(fdip, sizeof(*foi) + cnp->cn_namelen + 1); - if (!fsess_isimpl(mp, FUSE_CREATE)) { - SDT_PROBE2(fuse, , vnops, trace, 1, - "eh, daemon doesn't implement create?"); - return (EINVAL); - } - fdisp_make(fdip, FUSE_CREATE, vnode_mount(dvp), parentnid, td, cred); + if (!fsess_isimpl(mp, FUSE_CREATE) || vap->va_type == VSOCK) { + /* Fallback to FUSE_MKNOD/FUSE_OPEN */ + fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, + cred, mode, &op); + } else { + /* Use FUSE_CREATE */ + size_t insize; - foi = fdip->indata; - foi->mode = mode; - foi->flags = O_CREAT | O_RDWR; + op = FUSE_CREATE; + fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1); + fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred); + fci = fdip->indata; + fci->mode = mode; + fci->flags = O_CREAT | flags; + if (fuse_libabi_geq(data, 7, 12)) { + insize = sizeof(*fci); + fci->umask = td->td_proc->p_fd->fd_cmask; + } else { + insize = sizeof(struct fuse_open_in); + } - memcpy((char *)fdip->indata + sizeof(*foi), cnp->cn_nameptr, - cnp->cn_namelen); - ((char *)fdip->indata)[sizeof(*foi) + cnp->cn_namelen] = '\0'; + memcpy((char *)fdip->indata + insize, cnp->cn_nameptr, + cnp->cn_namelen); + ((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0'; + } err = fdisp_wait_answ(fdip); if (err) { - if (err == ENOSYS) + if (err == ENOSYS && op == FUSE_CREATE) { fsess_set_notimpl(mp, FUSE_CREATE); - goto out; + fdisp_destroy(fdip); + fdisp_make_mknod_for_fallback(fdip, cnp, dvp, + parentnid, td, cred, mode, &op); + err = fdisp_wait_answ(fdip); + } + if (err) + goto out; } feo = fdip->answ; - if ((err = fuse_internal_checkentry(feo, VREG))) { + if ((err = fuse_internal_checkentry(feo, vap->va_type))) { goto out; } - err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, VREG); + + if (op == FUSE_CREATE) { + foo = (struct fuse_open_out*)(feo + 1); + } else { + /* Issue a separate FUSE_OPEN */ + struct fuse_open_in *foi; + + fdip2 = &fdi2; + fdisp_init(fdip2, sizeof(*foi)); + fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td, + cred); + foi = fdip2->indata; + foi->flags = flags; + err = fdisp_wait_answ(fdip2); + if (err) + goto out; + foo = fdip2->answ; + } + err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; - uint64_t fh_id = ((struct fuse_open_out *)(feo + 1))->fh; + uint64_t fh_id = foo->fh; fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; - fri->flags = 
OFLAGS(mode); + fri->flags = flags; fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); - fuse_insert_message(fdip->tick); - return err; + fuse_insert_message(fdip->tick, false); + goto out; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); + fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, + feo->attr_valid_nsec, NULL); - fdip->answ = feo + 1; - - x_fh_id = ((struct fuse_open_out *)(feo + 1))->fh; - x_open_flags = ((struct fuse_open_out *)(feo + 1))->open_flags; - fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, x_fh_id); - fuse_vnode_open(*vpp, x_open_flags, td); + fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo); + fuse_vnode_open(*vpp, foo->open_flags, td); + /* + * Purge the parent's attribute cache because the daemon should've + * updated its mtime and ctime + */ + fuse_vnode_clear_attr_cache(dvp); cache_purge_negative(dvp); out: + if (fdip2) + fdisp_destroy(fdip2); fdisp_destroy(fdip); return err; } /* - * Our vnop_fsync roughly corresponds to the FUSE_FSYNC method. The Linux - * version of FUSE also has a FUSE_FLUSH method. - * - * On Linux, fsync() synchronizes a file's complete in-core state with that - * on disk. The call is not supposed to return until the system has completed - * that action or until an error is detected. - * - * Linux also has an fdatasync() call that is similar to fsync() but is not - * required to update the metadata such as access time and modification time. - */ + struct vnop_fdatasync_args { + struct vop_generic_args a_gen; + struct vnode * a_vp; + struct thread * a_td; + }; +*/ +static int +fuse_vnop_fdatasync(struct vop_fdatasync_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct thread *td = ap->a_td; + int waitfor = MNT_WAIT; + int err = 0; + + if (fuse_isdeadfs(vp)) { + return 0; + } + if ((err = vop_stdfdatasync_buf(ap))) + return err; + + return fuse_internal_fsync(vp, td, waitfor, true); +} + /* struct vnop_fsync_args { - struct vnodeop_desc *a_desc; + struct vop_generic_args a_gen; struct vnode * a_vp; - struct ucred * a_cred; int a_waitfor; struct thread * a_td; }; @@ -435,31 +786,16 @@ { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; + int waitfor = ap->a_waitfor; + int err = 0; - struct fuse_filehandle *fufh; - struct fuse_vnode_data *fvdat = VTOFUD(vp); - - int type, err = 0; - if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; - if (!fsess_isimpl(vnode_mount(vp), - (vnode_vtype(vp) == VDIR ? 
FUSE_FSYNCDIR : FUSE_FSYNC))) { - goto out; - } - for (type = 0; type < FUFH_MAXTYPE; type++) { - fufh = &(fvdat->fufh[type]); - if (FUFH_IS_VALID(fufh)) { - fuse_internal_fsync(vp, td, NULL, fufh); - } - } - -out: - return 0; + return fuse_internal_fsync(vp, td, waitfor, false); } /* @@ -477,12 +813,9 @@ struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_attr_out *fao; int err = 0; int dataflags; - struct fuse_dispatcher fdi; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; @@ -497,48 +830,14 @@ goto fake; } } - fdisp_init(&fdi, 0); - if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) { - if ((err == ENOTCONN) && vnode_isvroot(vp)) { - /* see comment in fuse_vfsop_statfs() */ - fdisp_destroy(&fdi); - goto fake; - } - if (err == ENOENT) { - fuse_internal_vnode_disappear(vp); - } - goto out; + err = fuse_internal_getattr(vp, vap, cred, td); + if (err == ENOTCONN && vnode_isvroot(vp)) { + /* see comment in fuse_vfsop_statfs() */ + goto fake; + } else { + return err; } - fao = (struct fuse_attr_out *)fdi.answ; - fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, - fao->attr_valid_nsec, vap); - if (vap->va_type != vnode_vtype(vp)) { - fuse_internal_vnode_disappear(vp); - err = ENOENT; - goto out; - } - if ((fvdat->flag & FN_SIZECHANGE) != 0) - vap->va_size = fvdat->filesize; - - if (vnode_isreg(vp) && (fvdat->flag & FN_SIZECHANGE) == 0) { - /* - * This is for those cases when the file size changed without us - * knowing, and we want to catch up. - */ - off_t new_filesize = ((struct fuse_attr_out *) - fdi.answ)->attr.size; - - if (fvdat->filesize != new_filesize) { - fuse_vnode_setsize(vp, new_filesize); - fvdat->flag &= ~FN_SIZECHANGE; - } - } - -out: - fdisp_destroy(&fdi); - return err; - fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); @@ -559,31 +858,27 @@ struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_filehandle *fufh = NULL; + struct fuse_filehandle *fufh, *fufh_tmp; - int type, need_flush = 1; + int need_flush = 1; - for (type = 0; type < FUFH_MAXTYPE; type++) { - fufh = &(fvdat->fufh[type]); - if (FUFH_IS_VALID(fufh)) { - if (need_flush && vp->v_type == VREG) { - if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { - fuse_vnode_savesize(vp, NULL); - } - if (fuse_data_cache_invalidate || - (fvdat->flag & FN_REVOKED) != 0) - fuse_io_invalbuf(vp, td); - else - fuse_io_flushbuf(vp, MNT_WAIT, td); - need_flush = 0; + LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { + if (need_flush && vp->v_type == VREG) { + if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { + fuse_vnode_savesize(vp, NULL, 0); } - fuse_filehandle_close(vp, type, td, NULL); + if ((fvdat->flag & FN_REVOKED) != 0) + fuse_io_invalbuf(vp, td); + else + fuse_io_flushbuf(vp, MNT_WAIT, td); + need_flush = 0; } + fuse_filehandle_close(vp, fufh, td, NULL); } - if ((fvdat->flag & FN_REVOKED) != 0 && fuse_reclaim_revoked) { + if ((fvdat->flag & FN_REVOKED) != 0) vrecycle(vp); - } + return 0; } @@ -635,11 +930,39 @@ feo = fdi.answ; err = fuse_internal_checkentry(feo, vnode_vtype(vp)); + if (!err) { + /* + * Purge the parent's attribute cache because the daemon + * should've updated its mtime and ctime + */ + fuse_vnode_clear_attr_cache(tdvp); + fuse_internal_cache_attrs(vp, &feo->attr, feo->attr_valid, + feo->attr_valid_nsec, NULL); + } out: fdisp_destroy(&fdi); return err; } +struct fuse_lookup_alloc_arg { + struct fuse_entry_out *feo; + struct 
componentname *cnp; + uint64_t nid; + enum vtype vtyp; +}; + +/* Callback for vn_get_ino */ +static int +fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) +{ + struct fuse_lookup_alloc_arg *flaa = arg; + + return fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp, + flaa->vtyp); +} + +SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup, + "int", "struct timespec*", "struct timespec*"); /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; @@ -668,268 +991,146 @@ struct vnode *vp = NULL; struct fuse_dispatcher fdi; - enum fuse_opcode op; + bool did_lookup = false; + struct fuse_entry_out *feo = NULL; + enum vtype vtyp; /* vnode type of target */ + off_t filesize; /* filesize of target */ uint64_t nid; - struct fuse_access_param facp; if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } - if (!vnode_isdir(dvp)) { + if (!vnode_isdir(dvp)) return ENOTDIR; - } - if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) { + + if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) return EROFS; - } - /* - * We do access check prior to doing anything else only in the case - * when we are at fs root (we'd like to say, "we are at the first - * component", but that's not exactly the same... nevermind). - * See further comments at further access checks. - */ - bzero(&facp, sizeof(facp)); - if (vnode_isvroot(dvp)) { /* early permission check hack */ - if ((err = fuse_internal_access(dvp, VEXEC, &facp, td, cred))) { - return err; - } - } + if ((err = fuse_internal_access(dvp, VEXEC, td, cred))) + return err; + if (flags & ISDOTDOT) { + KASSERT(VTOFUD(dvp)->flag & FN_PARENT_NID, + ("Looking up .. is TODO")); nid = VTOFUD(dvp)->parent_nid; - if (nid == 0) { + if (nid == 0) return ENOENT; - } - fdisp_init(&fdi, 0); - op = FUSE_GETATTR; - goto calldaemon; + /* .. is obviously a directory */ + vtyp = VDIR; + filesize = 0; } else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') { nid = VTOI(dvp); - fdisp_init(&fdi, 0); - op = FUSE_GETATTR; - goto calldaemon; - } else if (fuse_lookup_cache_enable) { - err = cache_lookup(dvp, vpp, cnp, NULL, NULL); - switch (err) { + /* . 
is obviously a directory */ + vtyp = VDIR; + filesize = 0; + } else { + struct timespec now, timeout; + err = cache_lookup(dvp, vpp, cnp, &timeout, NULL); + getnanouptime(&now); + SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now); + switch (err) { case -1: /* positive match */ - atomic_add_acq_long(&fuse_lookup_cache_hits, 1); + if (timespeccmp(&timeout, &now, >)) { + counter_u64_add(fuse_lookup_cache_hits, 1); + } else { + /* Cache timeout */ + counter_u64_add(fuse_lookup_cache_misses, 1); + bintime_clear( + &VTOFUD(*vpp)->entry_cache_timeout); + cache_purge(*vpp); + if (dvp != *vpp) + vput(*vpp); + else + vrele(*vpp); + *vpp = NULL; + break; + } return 0; case 0: /* no match in cache */ - atomic_add_acq_long(&fuse_lookup_cache_misses, 1); + counter_u64_add(fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ + getnanouptime(&now); + if (timespeccmp(&timeout, &now, <=)) { + /* Cache timeout */ + cache_purge_negative(dvp); + break; + } /* fall through */ default: return err; } - } - nid = VTOI(dvp); - fdisp_init(&fdi, cnp->cn_namelen + 1); - op = FUSE_LOOKUP; -calldaemon: - fdisp_make(&fdi, op, mp, nid, td, cred); + nid = VTOI(dvp); + fdisp_init(&fdi, cnp->cn_namelen + 1); + fdisp_make(&fdi, FUSE_LOOKUP, mp, nid, td, cred); - if (op == FUSE_LOOKUP) { memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; - } - lookup_err = fdisp_wait_answ(&fdi); + lookup_err = fdisp_wait_answ(&fdi); + did_lookup = true; - if ((op == FUSE_LOOKUP) && !lookup_err) { /* lookup call succeeded */ - nid = ((struct fuse_entry_out *)fdi.answ)->nodeid; - if (!nid) { - /* - * zero nodeid is the same as "not found", - * but it's also cacheable (which we keep - * keep on doing not as of writing this) - */ - lookup_err = ENOENT; - } else if (nid == FUSE_ROOT_ID) { - lookup_err = EINVAL; + if (!lookup_err) { + /* lookup call succeeded */ + feo = (struct fuse_entry_out *)fdi.answ; + nid = feo->nodeid; + if (nid == 0) { + /* zero nodeid means ENOENT and cache it */ + struct timespec timeout; + + fdi.answ_stat = ENOENT; + lookup_err = ENOENT; + if (cnp->cn_flags & MAKEENTRY) { + fuse_validity_2_timespec(feo, &timeout); + cache_enter_time(dvp, *vpp, cnp, + &timeout, NULL); + } + } else if (nid == FUSE_ROOT_ID) { + lookup_err = EINVAL; + } + vtyp = IFTOVT(feo->attr.mode); + filesize = feo->attr.size; } + if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) { + fdisp_destroy(&fdi); + return lookup_err; + } } - if (lookup_err && - (!fdi.answ_stat || lookup_err != ENOENT || op != FUSE_LOOKUP)) { - fdisp_destroy(&fdi); - return lookup_err; - } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { + /* Entry not found */ + if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { + err = fuse_internal_access(dvp, VWRITE, td, cred); + if (!err) { + /* + * Set the SAVENAME flag to hold onto the + * pathname for use later in VOP_CREATE or + * VOP_RENAME. + */ + cnp->cn_flags |= SAVENAME; - if ((nameiop == CREATE || nameiop == RENAME) && islastcn - /* && directory dvp has not been removed */ ) { - - if (vfs_isrdonly(mp)) { - err = EROFS; - goto out; + err = EJUSTRETURN; } -#if 0 /* THINK_ABOUT_THIS */ - if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { - goto out; - } -#endif - - /* - * Possibly record the position of a slot in the - * directory large enough for the new component name. - * This can be recorded in the vnode private data for - * dvp. 
Set the SAVENAME flag to hold onto the - * pathname for use later in VOP_CREATE or VOP_RENAME. - */ - cnp->cn_flags |= SAVENAME; - - err = EJUSTRETURN; - goto out; - } - /* Consider inserting name into cache. */ - - /* - * No we can't use negative caching, as the fs - * changes are out of our control. - * False positives' falseness turns out just as things - * go by, but false negatives' falseness doesn't. - * (and aiding the caching mechanism with extra control - * mechanisms comes quite close to beating the whole purpose - * caching...) - */ -#if 0 - if ((cnp->cn_flags & MAKEENTRY) != 0) { - SDT_PROBE2(fuse, , vnops, trace, 1, - "inserting NULL into cache"); - cache_enter(dvp, NULL, cnp); - } -#endif - err = ENOENT; - goto out; - - } else { - - /* !lookup_err */ - - struct fuse_entry_out *feo = NULL; - struct fuse_attr *fattr = NULL; - - if (op == FUSE_GETATTR) { - fattr = &((struct fuse_attr_out *)fdi.answ)->attr; } else { - feo = (struct fuse_entry_out *)fdi.answ; - fattr = &(feo->attr); + err = ENOENT; } - - /* - * If deleting, and at end of pathname, return parameters - * which can be used to remove file. If the wantparent flag - * isn't set, we return only the directory, otherwise we go on - * and lock the inode, being careful with ".". - */ - if (nameiop == DELETE && islastcn) { - /* - * Check for write access on directory. - */ - facp.xuid = fattr->uid; - facp.facc_flags |= FACCESS_STICKY; - err = fuse_internal_access(dvp, VWRITE, &facp, td, cred); - facp.facc_flags &= ~FACCESS_XQUERIES; - - if (err) { - goto out; - } - if (nid == VTOI(dvp)) { - vref(dvp); - *vpp = dvp; - } else { - err = fuse_vnode_get(dvp->v_mount, feo, nid, - dvp, &vp, cnp, IFTOVT(fattr->mode)); - if (err) - goto out; - *vpp = vp; - } - - /* - * Save the name for use in VOP_RMDIR and VOP_REMOVE - * later. - */ - cnp->cn_flags |= SAVENAME; - goto out; - - } - /* - * If rewriting (RENAME), return the inode and the - * information required to rewrite the present directory - * Must get inode of directory entry to verify it's a - * regular file, or empty directory. - */ - if (nameiop == RENAME && wantparent && islastcn) { - -#if 0 /* THINK_ABOUT_THIS */ - if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { - goto out; - } -#endif - - /* - * Check for "." - */ - if (nid == VTOI(dvp)) { - err = EISDIR; - goto out; - } - err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, - &vp, cnp, IFTOVT(fattr->mode)); - if (err) { - goto out; - } - *vpp = vp; - /* - * Save the name for use in VOP_RENAME later. - */ - cnp->cn_flags |= SAVENAME; - - goto out; - } + } else { + /* Entry was found */ if (flags & ISDOTDOT) { - struct mount *mp; - int ltype; + struct fuse_lookup_alloc_arg flaa; - /* - * Expanded copy of vn_vget_ino() so that - * fuse_vnode_get() can be used. 
- */ - mp = dvp->v_mount; - ltype = VOP_ISLOCKED(dvp); - err = vfs_busy(mp, MBF_NOWAIT); - if (err != 0) { - vfs_ref(mp); - VOP_UNLOCK(dvp, 0); - err = vfs_busy(mp, 0); - vn_lock(dvp, ltype | LK_RETRY); - vfs_rel(mp); - if (err) - goto out; - if ((dvp->v_iflag & VI_DOOMED) != 0) { - err = ENOENT; - vfs_unbusy(mp); - goto out; - } - } - VOP_UNLOCK(dvp, 0); - err = fuse_vnode_get(vnode_mount(dvp), feo, nid, NULL, - &vp, cnp, IFTOVT(fattr->mode)); - vfs_unbusy(mp); - vn_lock(dvp, ltype | LK_RETRY); - if ((dvp->v_iflag & VI_DOOMED) != 0) { - if (err == 0) - vput(vp); - err = ENOENT; - } - if (err) - goto out; + flaa.nid = nid; + flaa.feo = feo; + flaa.cnp = cnp; + flaa.vtyp = vtyp; + err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0, + &vp); *vpp = vp; } else if (nid == VTOI(dvp)) { vref(dvp); @@ -938,25 +1139,26 @@ struct fuse_vnode_data *fvdat; err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, - &vp, cnp, IFTOVT(fattr->mode)); - if (err) { + &vp, cnp, vtyp); + if (err) goto out; - } - fuse_vnode_setparent(vp, dvp); + *vpp = vp; /* * In the case where we are looking up a FUSE node * represented by an existing cached vnode, and the * true size reported by FUSE_LOOKUP doesn't match - * the vnode's cached size, fix the vnode cache to - * match the real object size. + * the vnode's cached size, then any cached writes + * beyond the file's current size are lost. * - * This can occur via FUSE distributed filesystems, - * irregular files, etc. + * We can get here: + * * following attribute cache expiration, or + * * due a bug in the daemon, or */ fvdat = VTOFUD(vp); if (vnode_isreg(vp) && - fattr->size != fvdat->filesize) { + filesize != fvdat->cached_attrs.va_size && + fvdat->flag & FN_SIZECHANGE) { /* * The FN_SIZECHANGE flag reflects a dirty * append. If userspace lets us know our cache @@ -966,131 +1168,64 @@ * * XXX: Maybe disable WB caching on this mount. */ - if (fvdat->flag & FN_SIZECHANGE) - printf("%s: WB cache incoherent on " - "%s!\n", __func__, - vnode_mount(vp)->mnt_stat.f_mntonname); + printf("%s: WB cache incoherent on %s!\n", + __func__, + vnode_mount(vp)->mnt_stat.f_mntonname); - (void)fuse_vnode_setsize(vp, fattr->size); fvdat->flag &= ~FN_SIZECHANGE; } - *vpp = vp; - } - if (op == FUSE_GETATTR) { - struct fuse_attr_out *fao = - (struct fuse_attr_out*)fdi.answ; - fuse_internal_cache_attrs(*vpp, - &fao->attr, fao->attr_valid, - fao->attr_valid_nsec, NULL); - } else { - struct fuse_entry_out *feo = - (struct fuse_entry_out*)fdi.answ; - fuse_internal_cache_attrs(*vpp, - &feo->attr, feo->attr_valid, - feo->attr_valid_nsec, NULL); - } + MPASS(feo != NULL); + fuse_internal_cache_attrs(*vpp, &feo->attr, + feo->attr_valid, feo->attr_valid_nsec, NULL); + fuse_validity_2_bintime(feo->entry_valid, + feo->entry_valid_nsec, + &fvdat->entry_cache_timeout); - /* Insert name into cache if appropriate. */ + if ((nameiop == DELETE || nameiop == RENAME) && + islastcn) + { + struct vattr dvattr; - /* - * Nooo, caching is evil. With caching, we can't avoid stale - * information taking over the playground (cached info is not - * just positive/negative, it does have qualitative aspects, - * too). And a (VOP/FUSE)_GETATTR is always thrown anyway, when - * walking down along cached path components, and that's not - * any cheaper than FUSE_LOOKUP. This might change with - * implementing kernel side attr caching, but... In Linux, - * lookup results are not cached, and the daemon is bombarded - * with FUSE_LOOKUPS on and on. 
This shows that by design, the - * daemon is expected to handle frequent lookup queries - * efficiently, do its caching in userspace, and so on. - * - * So just leave the name cache alone. - */ - - /* - * Well, now I know, Linux caches lookups, but with a - * timeout... So it's the same thing as attribute caching: - * we can deal with it when implement timeouts. - */ -#if 0 - if (cnp->cn_flags & MAKEENTRY) { - cache_enter(dvp, *vpp, cnp); - } -#endif - } -out: - if (!lookup_err) { - - /* No lookup error; need to clean up. */ - - if (err) { /* Found inode; exit with no vnode. */ - if (op == FUSE_LOOKUP) { - fuse_internal_forget_send(vnode_mount(dvp), td, cred, - nid, 1); - } - fdisp_destroy(&fdi); - return err; - } else { -#ifndef NO_EARLY_PERM_CHECK_HACK - if (!islastcn) { - /* - * We have the attributes of the next item - * *now*, and it's a fact, and we do not - * have to do extra work for it (ie, beg the - * daemon), and it neither depends on such - * accidental things like attr caching. So - * the big idea: check credentials *now*, - * not at the beginning of the next call to - * lookup. - * - * The first item of the lookup chain (fs root) - * won't be checked then here, of course, as - * its never "the next". But go and see that - * the root is taken care about at the very - * beginning of this function. - * - * Now, given we want to do the access check - * this way, one might ask: so then why not - * do the access check just after fetching - * the inode and its attributes from the - * daemon? Why bother with producing the - * corresponding vnode at all if something - * is not OK? We know what's the deal as - * soon as we get those attrs... There is - * one bit of info though not given us by - * the daemon: whether his response is - * authoritative or not... His response should - * be ignored if something is mounted over - * the dir in question. But that can be - * known only by having the vnode... + err = fuse_internal_access(dvp, VWRITE, td, + cred); + if (err != 0) + goto out; + /* + * if the parent's sticky bit is set, check + * whether we're allowed to remove the file. + * Need to figure out the vnode locking to make + * this work. 
*/ - int tmpvtype = vnode_vtype(*vpp); - - bzero(&facp, sizeof(facp)); - /*the early perm check hack */ - facp.facc_flags |= FACCESS_VA_VALID; - - if ((tmpvtype != VDIR) && (tmpvtype != VLNK)) { - err = ENOTDIR; + fuse_internal_getattr(dvp, &dvattr, cred, td); + if ((dvattr.va_mode & S_ISTXT) && + fuse_internal_access(dvp, VADMIN, td, + cred) && + fuse_internal_access(*vpp, VADMIN, td, + cred)) { + err = EPERM; + goto out; } - if (!err && !vnode_mountedhere(*vpp)) { - err = fuse_internal_access(*vpp, VEXEC, &facp, td, cred); - } - if (err) { - if (tmpvtype == VLNK) - SDT_PROBE2(fuse, , vnops, trace, - 1, "weird, permission " - "error with a symlink?"); - vput(*vpp); - *vpp = NULL; - } } -#endif + + if (islastcn && ( + (nameiop == DELETE) || + (nameiop == RENAME && wantparent))) { + cnp->cn_flags |= SAVENAME; + } + } } - fdisp_destroy(&fdi); +out: + if (err) { + if (vp != NULL && dvp != vp) + vput(vp); + else if (vp != NULL) + vrele(vp); + *vpp = NULL; + } + if (did_lookup) + fdisp_destroy(&fdi); return err; } @@ -1117,6 +1252,7 @@ return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); + fmdi.umask = curthread->td_proc->p_fd->fd_cmask; return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); @@ -1134,12 +1270,19 @@ fuse_vnop_mknod(struct vop_mknod_args *ap) { - return (EINVAL); -} + struct vnode *dvp = ap->a_dvp; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct vattr *vap = ap->a_vap; + if (fuse_isdeadfs(dvp)) + return ENXIO; + return fuse_internal_mknod(dvp, vpp, cnp, vap); +} + /* - struct vnop_open_args { + struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; @@ -1151,50 +1294,27 @@ fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; - int mode = ap->a_mode; + int a_mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; - - fufh_type_t fufh_type; + pid_t pid = td->td_proc->p_pid; struct fuse_vnode_data *fvdat; - int error, isdir = 0; - int32_t fuse_open_flags; - - if (fuse_isdeadfs(vp)) { + if (fuse_isdeadfs(vp)) return ENXIO; - } - if ((mode & (FREAD | FWRITE)) == 0) + if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO) + return (EOPNOTSUPP); + if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0) return EINVAL; fvdat = VTOFUD(vp); - if (vnode_isdir(vp)) { - isdir = 1; - } - fuse_open_flags = 0; - if (isdir) { - fufh_type = FUFH_RDONLY; - } else { - fufh_type = fuse_filehandle_xlate_from_fflags(mode); - /* - * For WRONLY opens, force DIRECT_IO. This is necessary - * since writing a partial block through the buffer cache - * will result in a read of the block and that read won't - * be allowed by the WRONLY open. 
- */ - if (fufh_type == FUFH_WRONLY || - (fvdat->flag & FN_DIRECTIO) != 0) - fuse_open_flags = FOPEN_DIRECT_IO; - } - - if (fuse_filehandle_validrw(vp, fufh_type) != FUFH_INVALID) { - fuse_vnode_open(vp, fuse_open_flags, td); + if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) { + fuse_vnode_open(vp, 0, td); return 0; } - error = fuse_filehandle_open(vp, fufh_type, NULL, td, cred); - return error; + return fuse_filehandle_open(vp, a_mode, NULL, td, cred); } static int @@ -1237,6 +1357,7 @@ struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; + pid_t pid = curthread->td_proc->p_pid; if (fuse_isdeadfs(vp)) { return ENXIO; @@ -1246,7 +1367,7 @@ ioflag |= IO_DIRECT; } - return fuse_io_dispatch(vp, uio, ioflag, cred); + return fuse_io_dispatch(vp, uio, ioflag, cred, pid); } /* @@ -1255,7 +1376,7 @@ struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; - int *ncookies; + int *a_ncookies; u_long **a_cookies; }; */ @@ -1265,13 +1386,18 @@ struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; - struct fuse_filehandle *fufh = NULL; struct fuse_iov cookediov; - int err = 0; - int freefufh = 0; + u_long *cookies; + off_t startoff; + ssize_t tresid; + int ncookies; + bool closefufh = false; + pid_t pid = curthread->td_proc->p_pid; + if (ap->a_eofflag) + *ap->a_eofflag = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } @@ -1280,26 +1406,61 @@ return EINVAL; } - if (!fuse_filehandle_valid(vp, FUFH_RDONLY)) { - SDT_PROBE2(fuse, , vnops, trace, 1, - "calling readdir() before open()"); - err = fuse_filehandle_open(vp, FUFH_RDONLY, &fufh, NULL, cred); - freefufh = 1; - } else { - err = fuse_filehandle_get(vp, FUFH_RDONLY, &fufh); + tresid = uio->uio_resid; + startoff = uio->uio_offset; + err = fuse_filehandle_get_dir(vp, &fufh, cred, pid); + if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { + /* + * nfsd will do VOP_READDIR without first doing VOP_OPEN. We + * must implicitly open the directory here + */ + err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred); + if (err == 0) { + /* + * When a directory is opened, it must be read from + * the beginning. Hopefully, the "startoff" still + * exists as an offset cookie for the directory. + * If not, it will read the entire directory without + * returning any entries and just return eof. 
+ */ + uio->uio_offset = 0; + } + closefufh = true; } - if (err) { + if (err) return (err); + if (ap->a_ncookies != NULL) { + ncookies = uio->uio_resid / + (offsetof(struct dirent, d_name) + 4) + 1; + cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); + *ap->a_ncookies = ncookies; + *ap->a_cookies = cookies; + } else { + ncookies = 0; + cookies = NULL; } #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1) fiov_init(&cookediov, DIRCOOKEDSIZE); - err = fuse_internal_readdir(vp, uio, fufh, &cookediov); + err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov, + &ncookies, cookies); fiov_teardown(&cookediov); - if (freefufh) { - fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); + if (closefufh) + fuse_filehandle_close(vp, fufh, curthread, cred); + + if (ap->a_ncookies != NULL) { + if (err == 0) { + *ap->a_ncookies -= ncookies; + } else { + free(*ap->a_cookies, M_TEMP); + *ap->a_ncookies = 0; + *ap->a_cookies = NULL; + } } + if (err == 0 && tresid == uio->uio_resid) + *ap->a_eofflag = 1; + return err; } @@ -1356,22 +1517,16 @@ { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_filehandle *fufh = NULL; + struct fuse_filehandle *fufh, *fufh_tmp; - int type; - if (!fvdat) { panic("FUSE: no vnode data during recycling"); } - for (type = 0; type < FUFH_MAXTYPE; type++) { - fufh = &(fvdat->fufh[type]); - if (FUFH_IS_VALID(fufh)) { - printf("FUSE: vnode being reclaimed but fufh (type=%d) is valid", - type); - fuse_filehandle_close(vp, type, td, NULL); - } + LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { + printf("FUSE: vnode being reclaimed with open fufh " + "(type=%#x)", fufh->fufh_type); + fuse_filehandle_close(vp, fufh, td, NULL); } if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) { @@ -1409,12 +1564,9 @@ if (vnode_isdir(vp)) { return EPERM; } - cache_purge(vp); err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); - if (err == 0) - fuse_internal_vnode_disappear(vp); return err; } @@ -1438,7 +1590,8 @@ struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct fuse_data *data; - + bool newparent = fdvp != tdvp; + bool isdir = fvp->v_type == VDIR; int err = 0; if (fuse_isdeadfs(fdvp)) { @@ -1446,7 +1599,7 @@ } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { - SDT_PROBE2(fuse, , vnops, trace, 1, "cross-device rename"); + SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename"); err = EXDEV; goto out; } @@ -1457,7 +1610,17 @@ * under the source directory in the file system tree. * Linux performs this check at VFS level. */ + /* + * If source is a directory, and it will get a new parent, user must + * have write permission to it, so ".." can be modified. 
+ */ data = fuse_get_mpdata(vnode_mount(tdvp)); + if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) { + err = fuse_internal_access(fvp, VWRITE, + tcnp->cn_thread, tcnp->cn_cred); + if (err) + goto out; + } sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); if (err == 0) { @@ -1515,8 +1678,6 @@ } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); - if (err == 0) - fuse_internal_vnode_disappear(vp); return err; } @@ -1535,129 +1696,137 @@ struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; + struct mount *mp; + struct fuse_data *data; + struct vattr old_va; + int dataflags; + int err = 0, err2; + accmode_t accmode = 0; + bool checkperm; + bool drop_suid = false; + gid_t cr_gid; - struct fuse_dispatcher fdi; - struct fuse_setattr_in *fsai; - struct fuse_access_param facp; + mp = vnode_mount(vp); + data = fuse_get_mpdata(mp); + dataflags = data->dataflags; + checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS; + if (cred->cr_ngroups > 0) + cr_gid = cred->cr_groups[0]; + else + cr_gid = 0; - int err = 0; - enum vtype vtyp; - int sizechanged = 0; - uint64_t newsize = 0; - if (fuse_isdeadfs(vp)) { return ENXIO; } - fdisp_init(&fdi, sizeof(*fsai)); - fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); - fsai = fdi.indata; - fsai->valid = 0; - bzero(&facp, sizeof(facp)); - - facp.xuid = vap->va_uid; - facp.xgid = vap->va_gid; - if (vap->va_uid != (uid_t)VNOVAL) { - facp.facc_flags |= FACCESS_CHOWN; - fsai->uid = vap->va_uid; - fsai->valid |= FATTR_UID; + if (checkperm) { + /* Only root may change a file's owner */ + err = priv_check_cred(cred, PRIV_VFS_CHOWN); + if (err) { + /* As a special case, allow the null chown */ + err2 = fuse_internal_getattr(vp, &old_va, cred, + td); + if (err2) + return (err2); + if (vap->va_uid != old_va.va_uid) + return err; + else + accmode |= VADMIN; + drop_suid = true; + } else + accmode |= VADMIN; + } else + accmode |= VADMIN; } if (vap->va_gid != (gid_t)VNOVAL) { - facp.facc_flags |= FACCESS_CHOWN; - fsai->gid = vap->va_gid; - fsai->valid |= FATTR_GID; + if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN)) + drop_suid = true; + if (checkperm && !groupmember(vap->va_gid, cred)) + { + /* + * Non-root users may only chgrp to one of their own + * groups + */ + err = priv_check_cred(cred, PRIV_VFS_CHOWN); + if (err) { + /* As a special case, allow the null chgrp */ + err2 = fuse_internal_getattr(vp, &old_va, cred, + td); + if (err2) + return (err2); + if (vap->va_gid != old_va.va_gid) + return err; + accmode |= VADMIN; + } else + accmode |= VADMIN; + } else + accmode |= VADMIN; } if (vap->va_size != VNOVAL) { - - struct fuse_filehandle *fufh = NULL; - - /*Truncate to a new value. */ - fsai->size = vap->va_size; - sizechanged = 1; - newsize = vap->va_size; - fsai->valid |= FATTR_SIZE; - - fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); - if (fufh) { - fsai->fh = fufh->fh_id; - fsai->valid |= FATTR_FH; + switch (vp->v_type) { + case VDIR: + return (EISDIR); + case VLNK: + case VREG: + if (vfs_isrdonly(mp)) + return (EROFS); + break; + default: + /* + * According to POSIX, the result is unspecified + * for file types other than regular files, + * directories and shared memory objects. We + * don't support shared memory objects in the file + * system, and have dubious support for truncating + * symlinks. Just ignore the request in other cases. + */ + return (0); } + /* Don't set accmode. 
Permission to trunc is checked upstack */ } - if (vap->va_atime.tv_sec != VNOVAL) { - fsai->atime = vap->va_atime.tv_sec; - fsai->atimensec = vap->va_atime.tv_nsec; - fsai->valid |= FATTR_ATIME; + if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { + if (vap->va_vaflags & VA_UTIMES_NULL) + accmode |= VWRITE; + else + accmode |= VADMIN; } - if (vap->va_mtime.tv_sec != VNOVAL) { - fsai->mtime = vap->va_mtime.tv_sec; - fsai->mtimensec = vap->va_mtime.tv_nsec; - fsai->valid |= FATTR_MTIME; + if (drop_suid) { + if (vap->va_mode != (mode_t)VNOVAL) + vap->va_mode &= ~(S_ISUID | S_ISGID); + else { + err = fuse_internal_getattr(vp, &old_va, cred, td); + if (err) + return (err); + vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID); + } } if (vap->va_mode != (mode_t)VNOVAL) { - fsai->mode = vap->va_mode & ALLPERMS; - fsai->valid |= FATTR_MODE; + /* Only root may set the sticky bit on non-directories */ + if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT) + && priv_check_cred(cred, PRIV_VFS_STICKYFILE)) + return EFTYPE; + if (checkperm && (vap->va_mode & S_ISGID)) { + err = fuse_internal_getattr(vp, &old_va, cred, td); + if (err) + return (err); + if (!groupmember(old_va.va_gid, cred)) { + err = priv_check_cred(cred, PRIV_VFS_SETGID); + if (err) + return (err); + } + } + accmode |= VADMIN; } - if (!fsai->valid) { - goto out; - } - vtyp = vnode_vtype(vp); - if (fsai->valid & FATTR_SIZE && vtyp == VDIR) { - err = EISDIR; - goto out; - } - if (vfs_isrdonly(vnode_mount(vp)) && (fsai->valid & ~FATTR_SIZE || vtyp == VREG)) { - err = EROFS; - goto out; - } - if (fsai->valid & ~FATTR_SIZE) { - /*err = fuse_internal_access(vp, VADMIN, context, &facp); */ - /*XXX */ - err = 0; - } - facp.facc_flags &= ~FACCESS_XQUERIES; + if (vfs_isrdonly(mp)) + return EROFS; - if (err && !(fsai->valid & ~(FATTR_ATIME | FATTR_MTIME)) && - vap->va_vaflags & VA_UTIMES_NULL) { - err = fuse_internal_access(vp, VWRITE, &facp, td, cred); - } + err = fuse_internal_access(vp, accmode, td, cred); if (err) - goto out; - if ((err = fdisp_wait_answ(&fdi))) - goto out; - vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode); - - if (vnode_vtype(vp) != vtyp) { - if (vnode_vtype(vp) == VNON && vtyp != VNON) { - SDT_PROBE2(fuse, , vnops, trace, 1, "FUSE: Dang! " - "vnode_vtype is VNON and vtype isn't."); - } else { - /* - * STALE vnode, ditch - * - * The vnode has changed its type "behind our back". - * There's nothing really we can do, so let us just - * force an internal revocation and tell the caller to - * try again, if interested. - */ - fuse_internal_vnode_disappear(vp); - err = EAGAIN; - } - } - if (err == 0) { - struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ; - fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, - fao->attr_valid_nsec, NULL); - } - -out: - fdisp_destroy(&fdi); - if (!err && sizechanged) { - fuse_vnode_setsize(vp, newsize); - VTOFUD(vp)->flag &= ~FN_SIZECHANGE; - } - return err; + return err; + else + return fuse_internal_setattr(vp, vap, td, cred); } /* @@ -1676,22 +1845,15 @@ bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); - return ENXIO; + return 0; } - if (bp->b_iocmd == BIO_WRITE) - fuse_vnode_refreshsize(vp, NOCRED); - (void)fuse_io_strategy(vp, bp); - /* - * This is a dangerous function. If returns error, that might mean a - * panic. We prefer pretty much anything over being forced to panic - * by a malicious daemon (a demon?). So we just return 0 anyway. 
You - * should never mind this: this function has its own error - * propagation mechanism via the argument buffer, so - * not-that-melodramatic residents of the call chain still will be - * able to know what to do. + * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags. + * fuse_io_strategy sets bp's error fields */ + (void)fuse_io_strategy(vp, bp); + return 0; } @@ -1757,237 +1919,70 @@ struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; + pid_t pid = curthread->td_proc->p_pid; if (fuse_isdeadfs(vp)) { return ENXIO; } - fuse_vnode_refreshsize(vp, cred); if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } - return fuse_io_dispatch(vp, uio, ioflag, cred); + return fuse_io_dispatch(vp, uio, ioflag, cred, pid); } -SDT_PROBE_DEFINE1(fuse, , vnops, vnop_getpages_error, "int"); -/* - struct vnop_getpages_args { - struct vnode *a_vp; - vm_page_t *a_m; - int a_count; - int a_reqpage; - }; -*/ -static int -fuse_vnop_getpages(struct vop_getpages_args *ap) +static daddr_t +fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { - int i, error, nextoff, size, toff, count, npages; - struct uio uio; - struct iovec iov; - vm_offset_t kva; - struct buf *bp; - struct vnode *vp; - struct thread *td; - struct ucred *cred; - vm_page_t *pages; + const int biosize = fuse_iosize(vp); - vp = ap->a_vp; - KASSERT(vp->v_object, ("objectless vp passed to getpages")); - td = curthread; /* XXX */ - cred = curthread->td_ucred; /* XXX */ - pages = ap->a_m; - npages = ap->a_count; + return (off / biosize); +} - if (!fsess_opt_mmap(vnode_mount(vp))) { - SDT_PROBE2(fuse, , vnops, trace, 1, - "called on non-cacheable vnode??\n"); - return (VM_PAGER_ERROR); - } +static int +fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn) +{ + off_t filesize; + int blksz, err; + const int biosize = fuse_iosize(vp); - /* - * If the last page is partially valid, just return it and allow - * the pager to zero-out the blanks. Partially valid pages can - * only occur at the file EOF. - * - * XXXGL: is that true for FUSE, which is a local filesystem, - * but still somewhat disconnected from the kernel? - */ - VM_OBJECT_WLOCK(vp->v_object); - if (pages[npages - 1]->valid != 0 && --npages == 0) - goto out; - VM_OBJECT_WUNLOCK(vp->v_object); + err = fuse_vnode_size(vp, &filesize, NULL, NULL); + KASSERT(err == 0, ("vfs_bio_getpages can't handle errors here")); + if (err) + return biosize; - /* - * We use only the kva address for the buffer, but this is extremely - * convenient and fast. - */ - bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK); - - kva = (vm_offset_t)bp->b_data; - pmap_qenter(kva, pages, npages); - VM_CNT_INC(v_vnodein); - VM_CNT_ADD(v_vnodepgsin, npages); - - count = npages << PAGE_SHIFT; - iov.iov_base = (caddr_t)kva; - iov.iov_len = count; - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); - uio.uio_resid = count; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = UIO_READ; - uio.uio_td = td; - - error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); - pmap_qremove(kva, npages); - - uma_zfree(fuse_pbuf_zone, bp); - - if (error && (uio.uio_resid == count)) { - SDT_PROBE1(fuse, , vnops, vnop_getpages_error, error); - return VM_PAGER_ERROR; + if ((off_t)lbn * biosize >= filesize) { + blksz = 0; + } else if ((off_t)(lbn + 1) * biosize > filesize) { + blksz = filesize - (off_t)lbn *biosize; + } else { + blksz = biosize; } - /* - * Calculate the number of bytes read and validate only that number - * of bytes. 
Note that due to pending writes, size may be 0. This - * does not mean that the remaining data is invalid! - */ - - size = count - uio.uio_resid; - VM_OBJECT_WLOCK(vp->v_object); - fuse_vm_page_lock_queues(); - for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { - vm_page_t m; - - nextoff = toff + PAGE_SIZE; - m = pages[i]; - - if (nextoff <= size) { - /* - * Read operation filled an entire page - */ - m->valid = VM_PAGE_BITS_ALL; - KASSERT(m->dirty == 0, - ("fuse_getpages: page %p is dirty", m)); - } else if (size > toff) { - /* - * Read operation filled a partial page. - */ - m->valid = 0; - vm_page_set_valid_range(m, 0, size - toff); - KASSERT(m->dirty == 0, - ("fuse_getpages: page %p is dirty", m)); - } else { - /* - * Read operation was short. If no error occurred - * we may have hit a zero-fill section. We simply - * leave valid set to 0. - */ - ; - } - } - fuse_vm_page_unlock_queues(); -out: - VM_OBJECT_WUNLOCK(vp->v_object); - if (ap->a_rbehind) - *ap->a_rbehind = 0; - if (ap->a_rahead) - *ap->a_rahead = 0; - return (VM_PAGER_OK); + return (blksz); } /* - struct vnop_putpages_args { + struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; - int a_sync; - int *a_rtvals; - vm_ooffset_t a_offset; + int a_reqpage; }; */ static int -fuse_vnop_putpages(struct vop_putpages_args *ap) +fuse_vnop_getpages(struct vop_getpages_args *ap) { - struct uio uio; - struct iovec iov; - vm_offset_t kva; - struct buf *bp; - int i, error, npages, count; - off_t offset; - int *rtvals; - struct vnode *vp; - struct thread *td; - struct ucred *cred; - vm_page_t *pages; - vm_ooffset_t fsize; + struct vnode *vp = ap->a_vp; - vp = ap->a_vp; - KASSERT(vp->v_object, ("objectless vp passed to putpages")); - fsize = vp->v_object->un_pager.vnp.vnp_size; - td = curthread; /* XXX */ - cred = curthread->td_ucred; /* XXX */ - pages = ap->a_m; - count = ap->a_count; - rtvals = ap->a_rtvals; - npages = btoc(count); - offset = IDX_TO_OFF(pages[0]->pindex); - if (!fsess_opt_mmap(vnode_mount(vp))) { - SDT_PROBE2(fuse, , vnops, trace, 1, + SDT_PROBE2(fusefs, , vnops, trace, 1, "called on non-cacheable vnode??\n"); + return (VM_PAGER_ERROR); } - for (i = 0; i < npages; i++) - rtvals[i] = VM_PAGER_AGAIN; - /* - * When putting pages, do not extend file past EOF. - */ - - if (offset + count > fsize) { - count = fsize - offset; - if (count < 0) - count = 0; - } - /* - * We use only the kva address for the buffer, but this is extremely - * convenient and fast. 
- */ - bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK); - - kva = (vm_offset_t)bp->b_data; - pmap_qenter(kva, pages, npages); - VM_CNT_INC(v_vnodeout); - VM_CNT_ADD(v_vnodepgsout, count); - - iov.iov_base = (caddr_t)kva; - iov.iov_len = count; - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = offset; - uio.uio_resid = count; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = UIO_WRITE; - uio.uio_td = td; - - error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); - - pmap_qremove(kva, npages); - uma_zfree(fuse_pbuf_zone, bp); - - if (!error) { - int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; - - for (i = 0; i < nwritten; i++) { - rtvals[i] = VM_PAGER_OK; - VM_OBJECT_WLOCK(pages[i]->object); - vm_page_undirty(pages[i]); - VM_OBJECT_WUNLOCK(pages[i]->object); - } - } - return rtvals[0]; + return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz)); } static const char extattr_namespace_separator = '.'; @@ -2023,6 +2018,13 @@ if (fuse_isdeadfs(vp)) return (ENXIO); + if (!fsess_isimpl(mp, FUSE_GETXATTR)) + return EOPNOTSUPP; + + err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); + if (err) + return err; + /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; @@ -2053,8 +2055,10 @@ err = fdisp_wait_answ(&fdi); if (err != 0) { - if (err == ENOSYS) + if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_GETXATTR); + err = EOPNOTSUPP; + } goto out; } @@ -2100,6 +2104,29 @@ if (fuse_isdeadfs(vp)) return (ENXIO); + if (!fsess_isimpl(mp, FUSE_SETXATTR)) + return EOPNOTSUPP; + + if (vfs_isrdonly(mp)) + return EROFS; + + /* Deleting xattrs must use VOP_DELETEEXTATTR instead */ + if (ap->a_uio == NULL) { + /* + * If we got here as fallback from VOP_DELETEEXTATTR, then + * return EOPNOTSUPP. + */ + if (!fsess_isimpl(mp, FUSE_REMOVEXATTR)) + return (EOPNOTSUPP); + else + return (EINVAL); + } + + err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, + VWRITE); + if (err) + return err; + /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; @@ -2127,11 +2154,14 @@ err = fdisp_wait_answ(&fdi); - if (err != 0) { - if (err == ENOSYS) - fsess_set_notimpl(mp, FUSE_SETXATTR); - goto out; + if (err == ENOSYS) { + fsess_set_notimpl(mp, FUSE_SETXATTR); + err = EOPNOTSUPP; } + if (err == ERESTART) { + /* Can't restart after calling uiomove */ + err = EINTR; + } out: fdisp_destroy(&fdi); @@ -2227,6 +2257,13 @@ if (fuse_isdeadfs(vp)) return (ENXIO); + if (!fsess_isimpl(mp, FUSE_LISTXATTR)) + return EOPNOTSUPP; + + err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); + if (err) + return err; + /* * Add space for a NUL and the period separator if enabled. * Default to looking for user attributes. @@ -2251,8 +2288,10 @@ err = fdisp_wait_answ(&fdi); if (err != 0) { - if (err == ENOSYS) + if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LISTXATTR); + err = EOPNOTSUPP; + } goto out; } @@ -2267,7 +2306,7 @@ /* * Retrieve Linux / FUSE compatible list values. 
*/ - fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); + fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); list_xattr_in = fdi.indata; list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out); attr_str = (char *)fdi.indata + sizeof(*list_xattr_in); @@ -2330,6 +2369,17 @@ if (fuse_isdeadfs(vp)) return (ENXIO); + if (!fsess_isimpl(mp, FUSE_REMOVEXATTR)) + return EOPNOTSUPP; + + if (vfs_isrdonly(mp)) + return EROFS; + + err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, + VWRITE); + if (err) + return err; + /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; @@ -2347,9 +2397,9 @@ ap->a_name); err = fdisp_wait_answ(&fdi); - if (err != 0) { - if (err == ENOSYS) - fsess_set_notimpl(mp, FUSE_REMOVEXATTR); + if (err == ENOSYS) { + fsess_set_notimpl(mp, FUSE_REMOVEXATTR); + err = EOPNOTSUPP; } fdisp_destroy(&fdi); @@ -2373,3 +2423,48 @@ return 0; } + +/* + * Get an NFS filehandle for a FUSE file. + * + * This will only work for FUSE file systems that guarantee the uniqueness of + * nodeid:generation, which most don't. + */ +/* +vop_vptofh { + IN struct vnode *a_vp; + IN struct fid *a_fhp; +}; +*/ +static int +fuse_vnop_vptofh(struct vop_vptofh_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp); + _Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid), + "FUSE fid type is too big"); + struct mount *mp = vnode_mount(vp); + struct fuse_data *data = fuse_get_mpdata(mp); + struct vattr va; + int err; + + if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) + return EOPNOTSUPP; + + err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread); + if (err) + return err; + + /*ip = VTOI(ap->a_vp);*/ + /*ufhp = (struct ufid *)ap->a_fhp;*/ + fhp->len = sizeof(struct fuse_fid); + fhp->nid = fvdat->nid; + if (fvdat->generation <= UINT32_MAX) + fhp->gen = fvdat->generation; + else + return EOVERFLOW; + return (0); +} + + Index: sys/kern/kern_sig.c =================================================================== --- sys/kern/kern_sig.c +++ sys/kern/kern_sig.c @@ -929,6 +929,23 @@ #endif #endif /* COMPAT_43 */ +/* Would this signal be fatal to the current process, if it were caught ? */ +bool +sig_isfatal(struct proc *p, int sig) +{ + intptr_t act; + int prop; + + mtx_assert(&p->p_sigacts->ps_mtx, MA_OWNED); + act = (intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]; + if ((intptr_t)SIG_DFL == act) { + prop = sigprop(sig); + return (0 != (prop & (SIGPROP_KILL | SIGPROP_CORE))); + } else { + return (false); + } +} + /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. Index: sys/kern/vfs_cache.c =================================================================== --- sys/kern/vfs_cache.c +++ sys/kern/vfs_cache.c @@ -1964,7 +1964,7 @@ } /* - * Invalidate all entries to a particular vnode. + * Invalidate all entries from and to a particular vnode. 
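+ * ("from" covers entries naming children of the vnode; "to" covers entries
+ * that resolve to the vnode itself.)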
*/ void cache_purge(struct vnode *vp) Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -118,6 +118,8 @@ static void vfs_knl_assert_unlocked(void *arg); static void vnlru_return_batches(struct vfsops *mnt_op); static void destroy_vpollinfo(struct vpollinfo *vi); +static int v_inval_buf_range1(struct vnode *vp, struct bufobj *bo, + daddr_t startlbn, daddr_t endlbn); /* * These fences are intended for cases where some synchronization is @@ -945,6 +947,12 @@ * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. + * + * @param mp Try to reclaim vnodes from this mountpoint + * @param reclaim_nc_src Only reclaim directories with outgoing namecache + * entries if this argument is strue + * @param reclaim_free Only reclaim free vnodes if this is set. + * @return The number of vnodes that were reclaimed. */ static int vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger) @@ -1954,9 +1962,8 @@ vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; - int anyfreed; - daddr_t trunclbn; struct bufobj *bo; + daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); @@ -1964,22 +1971,114 @@ /* * Round up to the *next* lbn. */ - trunclbn = howmany(length, blksize); + startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); + restart: bo = &vp->v_bufobj; BO_LOCK(bo); + if (v_inval_buf_range1(vp, bo, startlbn, INT64_MAX) == EAGAIN) + goto restart; + + if (length > 0) { +restartsync: + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + if (bp->b_lblkno > 0) + continue; + /* + * Since we hold the vnode lock this should only + * fail if we're racing with the buf daemon. + */ + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + BO_LOCKPTR(bo)) == ENOLCK) { + goto restart; + } + VNASSERT((bp->b_flags & B_DELWRI), vp, + ("buf(%p) on dirty queue without DELWRI", bp)); + + bremfree(bp); + bawrite(bp); + BO_LOCK(bo); + goto restartsync; + } + } + + bufobj_wwait(bo, 0, 0); + BO_UNLOCK(bo); + vnode_pager_setsize(vp, length); + + return (0); +} + +/* + * Invalidate the cached pages of a file's buffer within the range of block + * numbers [startlbn, endlbn). Every buffer that overlaps that range will be + * invalidated. This must not result in any dirty data being lost. + */ +void +v_inval_buf_range(struct vnode *vp, off_t start, off_t end, int blksize) +{ + struct bufobj *bo; + daddr_t startlbn, endlbn; + vm_pindex_t startp, endp; + + /* Round "outwards" */ + startlbn = start / blksize; + endlbn = howmany(end, blksize); + startp = OFF_TO_IDX(start); + endp = OFF_TO_IDX(end + PAGE_SIZE - 1); + + ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); + +restart: + bo = &vp->v_bufobj; + BO_LOCK(bo); + +#ifdef INVARIANTS + struct buf *bp, *nbp; + + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + /* + * Disallow invalidating dirty data outside of the requested + * offsets. Assume that data within the requested offsets is + * being invalidated for a good reason. 
+ */ + off_t blkstart, blkend; + + blkstart = bp->b_offset; + blkend = bp->b_offset + bp->b_bcount; + KASSERT(blkstart >= start && blkend <= end, + ("Invalidating extra dirty data!")); + } +#endif + + if (v_inval_buf_range1(vp, bo, startlbn, endlbn) == EAGAIN) + goto restart; + + BO_UNLOCK(bo); + vn_pages_remove(vp, startp, endp); +} + +/* Like v_inval_buf_range, but operates on whole buffers instead of offsets */ +static int +v_inval_buf_range1(struct vnode *vp, struct bufobj *bo, + daddr_t startlbn, daddr_t endlbn) +{ + struct buf *bp, *nbp; + int anyfreed; + anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { - if (bp->b_lblkno < trunclbn) + if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) - goto restart; + return EAGAIN; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); @@ -1993,17 +2092,17 @@ (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { BO_UNLOCK(bo); - goto restart; + return EAGAIN; } } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { - if (bp->b_lblkno < trunclbn) + if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) - goto restart; + return EAGAIN; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; @@ -2016,40 +2115,11 @@ (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { BO_UNLOCK(bo); - goto restart; + return EAGAIN; } } } - - if (length > 0) { -restartsync: - TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { - if (bp->b_lblkno > 0) - continue; - /* - * Since we hold the vnode lock this should only - * fail if we're racing with the buf daemon. - */ - if (BUF_LOCK(bp, - LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, - BO_LOCKPTR(bo)) == ENOLCK) { - goto restart; - } - VNASSERT((bp->b_flags & B_DELWRI), vp, - ("buf(%p) on dirty queue without DELWRI", bp)); - - bremfree(bp); - bawrite(bp); - BO_LOCK(bo); - goto restartsync; - } - } - - bufobj_wwait(bo, 0, 0); - BO_UNLOCK(bo); - vnode_pager_setsize(vp, length); - - return (0); + return 0; } static void Index: sys/sys/signalvar.h =================================================================== --- sys/sys/signalvar.h +++ sys/sys/signalvar.h @@ -384,6 +384,7 @@ void sigexit(struct thread *td, int sig) __dead2; int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **); int sig_ffs(sigset_t *set); +bool sig_isfatal(struct proc *p, int sig); void siginit(struct proc *p); void signotify(struct thread *td); void sigqueue_delete(struct sigqueue *queue, int sig); Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -659,6 +659,8 @@ void vinactive(struct vnode *, struct thread *); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, off_t length, int blksize); +void v_inval_buf_range(struct vnode *vp, off_t start, off_t end, + int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); int vrecycle(struct vnode *vp); Index: sys/vm/vnode_pager.c =================================================================== --- sys/vm/vnode_pager.c +++ sys/vm/vnode_pager.c @@ -464,8 +464,7 @@ * File has shrunk. Toss any cached pages beyond the new EOF. 
*/ if (nobjsize < object->size) - vm_object_page_remove(object, nobjsize, object->size, - 0); + vm_object_page_remove(object, nobjsize, 0, 0); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. Index: tests/sys/fs/Makefile =================================================================== --- tests/sys/fs/Makefile +++ tests/sys/fs/Makefile @@ -1,5 +1,7 @@ # $FreeBSD$ +.include + PACKAGE= tests TESTSDIR= ${TESTSBASE}/sys/fs @@ -7,6 +9,9 @@ TESTSRC= ${SRCTOP}/contrib/netbsd-tests/fs #TESTS_SUBDIRS+= nullfs # XXX: needs rump +.if ${COMPILER_FEATURES:Mc++14} +TESTS_SUBDIRS+= fusefs +.endif TESTS_SUBDIRS+= tmpfs ${PACKAGE}FILES+= h_funcs.subr
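
The following is a minimal, hypothetical sketch of how a file system might use
the v_inval_buf_range() interface added by this change.  The caller name, the
use of f_iosize as the block size, and the surrounding locking context are
illustrative assumptions only; they are not part of this patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/vnode.h>

/*
 * Hypothetical caller (not part of this patch): drop the cached buffers and
 * pages backing a byte range of a vnode, for example after a remote file
 * system learns that the data changed on the server side.
 */
static void
example_invalidate_range(struct vnode *vp, off_t start, off_t end)
{
	/* f_iosize is one plausible source for the logical block size. */
	int blksize = vp->v_mount->mnt_stat.f_iosize;

	/* v_inval_buf_range() asserts that the vnode lock is held. */
	ASSERT_VOP_LOCKED(vp, "example_invalidate_range");

	/*
	 * Invalidate every buffer overlapping [start, end) and remove the
	 * backing pages.  Under INVARIANTS the implementation asserts that
	 * no dirty buffer extends outside the requested range.
	 */
	v_inval_buf_range(vp, start, end, blksize);
}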