Index: MAINTAINERS =================================================================== --- MAINTAINERS +++ MAINTAINERS @@ -53,6 +53,7 @@ etc/mail gshapiro Pre-commit review requested. Keep in sync with -STABLE. etc/sendmail gshapiro Pre-commit review requested. Keep in sync with -STABLE. fetch des Pre-commit review requested, email only. +fusefs(5) asomers Pre-commit review requested. geli pjd Pre-commit review requested (both sys/geom/eli/ and sbin/geom/class/eli/). isci(4) jimharris Pre-commit review requested. iwm(4) adrian Pre-commit review requested, send to freebsd-wireless@freebsd.org Index: UPDATING =================================================================== --- UPDATING +++ UPDATING @@ -26,6 +26,18 @@ disable the most expensive debugging functionality run "ln -s 'abort:false,junk:false' /etc/malloc.conf".) +20190727: + The vfs.fusefs.sync_unmount and vfs.fusefs.init_backgrounded sysctls + and the "-o sync_unmount" and "-o init_backgrounded" mount options have + been removed from mount_fusefs(8). You can safely remove them from + your scripts, because they had no effect. + + The vfs.fusefs.fix_broken_io, vfs.fusefs.sync_resize, + vfs.fusefs.refresh_size, vfs.fusefs.mmap_enable, + vfs.fusefs.reclaim_revoked, and vfs.fusefs.data_cache_invalidate + sysctls have been removed. If you felt the need to set any of them to + a non-default value, please tell asomers@FreeBSD.org why. + 20190713: Default permissions on the /var/account/acct file (and copies of it rotated by periodic daily scripts) are changed from 0644 to 0640 Index: etc/mtree/BSD.tests.dist =================================================================== --- etc/mtree/BSD.tests.dist +++ etc/mtree/BSD.tests.dist @@ -731,6 +731,8 @@ file .. fs + fusefs + .. tmpfs .. .. 
Index: sbin/mount_fusefs/mount_fusefs.8 =================================================================== --- sbin/mount_fusefs/mount_fusefs.8 +++ sbin/mount_fusefs/mount_fusefs.8 @@ -3,6 +3,11 @@ .\" Copyright (c) 2005, 2006 Csaba Henk .\" All rights reserved. .\" +.\" Copyright (c) 2019 The FreeBSD Foundation +.\" +.\" Portions of this documentation were written by BFF Storage Systems under +.\" sponsorship from the FreeBSD Foundation. +.\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: @@ -29,7 +34,7 @@ .\" .\" $FreeBSD$ .\" -.Dd November 17, 2018 +.Dd July 31, 2019 .Dt MOUNT_FUSEFS 8 .Os .Sh NAME @@ -108,27 +113,27 @@ .Xr sudoers 5 file. .It Fl S , Ic --safe -Run in safe mode (i.e. reject invoking a filesystem daemon) +Run in safe mode (i.e., reject invoking a filesystem daemon). .It Fl v -Be verbose -.It Fl D, Ic --daemon Ar daemon +Be verbose. +.It Fl D , Ic --daemon Ar daemon Call the specified -.Ar daemon -.It Fl O, Ic --daemon_opts Ar opts +.Ar daemon . +.It Fl O , Ic --daemon_opts Ar opts Add .Ar opts -to the daemon's command line -.It Fl s, Ic --special Ar special +to the daemon's command line. +.It Fl s , Ic --special Ar special Use .Ar special -as special -.It Fl m, Ic --mountpath Ar node +as special. +.It Fl m , Ic --mountpath Ar node Mount on -.Ar node -.It Fl h, Ic --help -Show help -.It Fl V, Ic --version -Show version information +.Ar node . +.It Fl h , Ic --help +Show help. +.It Fl V , Ic --version +Show version information. .It Fl o Mount options are specified via .Fl o . @@ -136,23 +141,38 @@ by prefixing them with .Dq no ) : .Bl -tag -width indent -.It Cm default_permissions -Enable traditional (file mode based) permission checking in kernel .It Cm allow_other Do not apply .Sx STRICT ACCESS POLICY . -Only root can use this option +Only root can use this option. +.It Cm async +I/O to the file system may be done asynchronously. 
+Writes may be delayed and/or reordered. +.It Cm default_permissions +Enable traditional (file mode based) permission checking in kernel. +.It Cm intr +Allow signals to interrupt operations that are blocked waiting for a reply from the server. +When this option is in use, system calls may fail with +.Er EINTR +whenever a signal is received. .It Cm max_read Ns = Ns Ar n Limit size of read requests to -.Ar n +.Ar n . +.It Cm neglect_shares +Do not refuse unmounting if there are secondary mounts. .It Cm private Refuse shared mounting of the daemon. This is the default behaviour, to allow sharing, expicitly use -.Fl o Cm noprivate -.It Cm neglect_shares -Do not refuse unmounting if there are secondary mounts +.Fl o Cm noprivate . .It Cm push_symlinks_in -Prefix absolute symlinks with the mountpoint +Prefix absolute symlinks with the mountpoint. +.It Cm subtype Ns = Ns Ar fsname +Suffix +.Ar fsname +to the file system name as reported by +.Xr statfs 2 . +This option can be used to identify the file system implemented by +.Ar fuse_daemon . .El .El .Pp @@ -167,11 +187,11 @@ Currently the options supported by the kernel are: .Bl -tag -width indent .It Cm direct_io -Bypass the buffer cache system +Bypass the buffer cache system. .It Cm kernel_cache By default cached buffers of a given file are flushed at each .Xr open 2 . -This option disables this behaviour +This option disables this behaviour. .El .Sh DAEMON MOUNTS Usually users do not need to use @@ -194,7 +214,7 @@ real gid) as the user. .Pp This is applied for Fuse mounts by default and only root can mount without -the strict access policy (i.e. the +the strict access policy (i.e., the .Cm allow_other mount option). .Pp @@ -206,7 +226,7 @@ are concerned) by doing their own secondary mount (See .Sx SHARED MOUNTS ) . .Sh SHARED MOUNTS -A Fuse daemon can be shared (i.e. mounted multiple times). +A Fuse daemon can be shared (i.e., mounted multiple times). 
When doing the first (primary) mount, the spawner and the mounter of the daemon must have the same uid, or the mounter should be the superuser. .Pp @@ -225,7 +245,7 @@ .Pp The device name of a secondary mount is the device name of the corresponding primary mount, followed by a '#' character and the index of the secondary -mount; e.g. +mount; e.g., .Pa /dev/fuse0#3 . .Sh SECURITY System administrators might want to use a custom mount policy (ie., one going @@ -239,7 +259,7 @@ is capable of invoking an arbitrary program, one must be careful when doing this. .Nm is designed in a way such that it makes that easy. -For this purpose, there are options which disable certain risky features (i.e. +For this purpose, there are options which disable certain risky features ( .Fl S and .Fl A ) , @@ -342,7 +362,7 @@ was written as the part of the .Fx implementation of the Fuse userspace filesystem framework (see -.Xr https://github.com/libfuse/libfuse ) +.Lk https://github.com/libfuse/libfuse ) and first appeared in the .Pa sysutils/fusefs-kmod port, supporting Index: sbin/mount_fusefs/mount_fusefs.c =================================================================== --- sbin/mount_fusefs/mount_fusefs.c +++ sbin/mount_fusefs/mount_fusefs.c @@ -5,6 +5,11 @@ * Copyright (c) 2005 Csaba Henk * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems under + * sponsorship from the FreeBSD Foundation. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -60,7 +65,6 @@ void usage(void); void helpmsg(void); void showversion(void); -int init_backgrounded(void); static struct mntopt mopts[] = { #define ALTF_PRIVATE 0x01 @@ -73,8 +77,6 @@ { "max_read=", 0, ALTF_MAXREAD, 1 }, #define ALTF_SUBTYPE 0x40 { "subtype=", 0, ALTF_SUBTYPE, 1 }, - #define ALTF_SYNC_UNMOUNT 0x80 - { "sync_unmount", 0, ALTF_SYNC_UNMOUNT, 1 }, /* * MOPT_AUTOMOUNTED, included by MOPT_STDOPTS, does not fit into * the 'flags' argument to nmount(2). We have to abuse altflags @@ -82,6 +84,8 @@ */ #define ALTF_AUTOMOUNTED 0x100 { "automounted", 0, ALTF_AUTOMOUNTED, 1 }, + #define ALTF_INTR 0x200 + { "intr", 0, ALTF_INTR, 1 }, /* Linux specific options, we silently ignore them */ { "fsname=", 0, 0x00, 1 }, { "fd=", 0, 0x00, 1 }, @@ -91,6 +95,8 @@ { "large_read", 0, 0x00, 1 }, /* "nonempty", just the first two chars are stripped off during parsing */ { "nempty", 0, 0x00, 1 }, + { "async", 0, MNT_ASYNC, 0}, + { "noasync", 1, MNT_ASYNC, 0}, MOPT_STDOPTS, MOPT_END }; @@ -107,7 +113,7 @@ { 0, NULL, 0 } }; -#define DEFAULT_MOUNT_FLAGS ALTF_PRIVATE | ALTF_SYNC_UNMOUNT +#define DEFAULT_MOUNT_FLAGS ALTF_PRIVATE int main(int argc, char *argv[]) @@ -409,12 +415,6 @@ } } - if (fd >= 0 && ! init_backgrounded() && close(fd) < 0) { - if (pid) - kill(pid, SIGKILL); - err(1, "failed to close fuse device"); - } - /* Prepare the options vector for nmount(). build_iovec() is declared * in mntopts.h. 
*/ sprintf(fdstr, "%d", fd); @@ -471,6 +471,7 @@ " -o allow_other allow access to other users\n" /* " -o nonempty allow mounts over non-empty file/dir\n" */ " -o default_permissions enable permission checking by kernel\n" + " -o intr interruptible mount\n" /* " -o fsname=NAME set filesystem name\n" " -o large_read issue large read requests (2.4 only)\n" @@ -481,7 +482,6 @@ " -o neglect_shares don't report EBUSY when unmount attempted\n" " in presence of secondary mounts\n" " -o push_symlinks_in prefix absolute symlinks with mountpoint\n" - " -o sync_unmount do unmount synchronously\n" ); exit(EX_USAGE); } @@ -491,18 +491,4 @@ { puts("mount_fusefs [fuse4bsd] version: " FUSE4BSD_VERSION); exit(EX_USAGE); -} - -int -init_backgrounded(void) -{ - int ibg; - size_t len; - - len = sizeof(ibg); - - if (sysctlbyname("vfs.fusefs.init_backgrounded", &ibg, &len, NULL, 0)) - return (0); - - return (ibg); } Index: share/man/man5/fusefs.5 =================================================================== --- share/man/man5/fusefs.5 +++ share/man/man5/fusefs.5 @@ -3,8 +3,8 @@ .\" .\" Copyright (c) 2019 The FreeBSD Foundation .\" -.\" This software was developed by BFF Storage Systems, LLC under sponsorship -.\" from the FreeBSD Foundation. +.\" This documentation was written by BFF Storage Systems, LLC under +.\" sponsorship from the FreeBSD Foundation. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions @@ -28,7 +28,7 @@ .\" SUCH DAMAGE. .\" .\" $FreeBSD$ -.Dd April 13, 2019 +.Dd July 31, 2019 .Dt FUSEFS 5 .Os .Sh NAME @@ -60,11 +60,9 @@ API is portable. Many daemons can run on multiple operating systems with minimal modifications. 
.Sh SYSCTL VARIABLES -The following variables are available as both +The following .Xr sysctl 8 -variables and -.Xr loader 8 -tunables: +variables are available: .Bl -tag -width indent .It Va vfs.fusefs.kernelabi_major Major version of the FUSE kernel ABI supported by this driver. @@ -73,7 +71,7 @@ .It Va vfs.fusefs.data_cache_mode Controls how .Nm -will cache file data. +will cache file data for pre-7.23 file systems. A value of 0 will disable caching entirely. Every data access will be forwarded to the daemon. A value of 1 will select write-through caching. @@ -84,33 +82,26 @@ to the daemon by the page daemon. Write-back caching is usually unsafe, especially for FUSE file systems that require network access. -.It Va vfs.fusefs.lookup_cache_enable -Controls whether -.Nm -will cache lookup responses from the file system. -FUSE file systems indicate whether lookup responses should be cacheable, but -it may be useful to globally disable caching them if a file system is -misbehaving. +.Pp +FUSE file systems using protocol 7.23 or later specify their cache behavior +on a per-mountpoint basis, ignoring this sysctl. +.It Va vfs.fusefs.stats.filehandle_count +Current number of open FUSE file handles. +.It Va vfs.fusefs.stats.lookup_cache_hits +Total number of lookup cache hits. +.It Va vfs.fusefs.stats.lookup_cache_misses +Total number of lookup cache misses. +.It Va vfs.fusefs.stats.node_count +Current number of allocated FUSE vnodes. +.It Va vfs.fusefs.stats.ticket_count +Current number of allocated FUSE tickets, which is roughly equal to the number +of FUSE operations currently being processed by daemons. 
.\" Undocumented sysctls .\" ==================== -.\" Counters: I intend to rename to vfs.fusefs.stats.* for clarity -.\" vfs.fusefs.lookup_cache_{hits, misses} -.\" vfs.fusefs.filehandle_count -.\" vfs.fusefs.ticker_count -.\" vfs.fusefs.node_count -.\" -.\" vfs.fusefs.version - useless since the driver moved in-tree -.\" vfs.fusefs.reclaim_revoked: I don't understand it well-enough -.\" vfs.fusefs.sync_unmount: dead code .\" vfs.fusefs.enforce_dev_perms: I don't understand it well enough. -.\" vfs.fusefs.init_backgrounded: dead code .\" vfs.fusefs.iov_credit: I don't understand it well enough .\" vfs.fusefs.iov_permanent_bufsize: I don't understand it well enough -.\" vfs.fusefs.fix_broken_io: I don't understand it well enough -.\" vfs.fusefs.sync_resize: useless and should be removed -.\" vfs.fusefs.refresh_size: probably useless? -.\" vfs.fusefs.mmap_enable: why is this optional? -.\" vfs.fusefs.data_cache_invalidate: what is this needed for? +.El .Sh SEE ALSO .Xr mount_fusefs 8 .Sh HISTORY @@ -119,7 +110,7 @@ driver was written as the part of the .Fx implementation of the FUSE userspace file system framework (see -.Xr https://github.com/libfuse/libfuse ) +.Lk https://github.com/libfuse/libfuse ) and first appeared in the .Pa sysutils/fusefs-kmod port, supporting Index: sys/fs/fuse/fuse.h =================================================================== --- sys/fs/fuse/fuse.h +++ sys/fs/fuse/fuse.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -63,87 +68,10 @@ #define FUSE_MIN_DAEMON_TIMEOUT 0 /* s */ #define FUSE_MAX_DAEMON_TIMEOUT 600 /* s */ -#ifndef FUSE_FREEBSD_VERSION -#define FUSE_FREEBSD_VERSION "0.4.4" -#endif - -/* Mapping versions to features */ - -#define FUSE_KERNELABI_GEQ(maj, min) \ -(FUSE_KERNEL_VERSION > (maj) || (FUSE_KERNEL_VERSION == (maj) && FUSE_KERNEL_MINOR_VERSION >= (min))) - -/* - * Appearance of new FUSE operations is not always in par with version - * numbering... At least, 7.3 is a sufficient condition for having - * FUSE_{ACCESS,CREATE}. - */ -#if FUSE_KERNELABI_GEQ(7, 3) -#ifndef FUSE_HAS_ACCESS -#define FUSE_HAS_ACCESS 1 -#endif -#ifndef FUSE_HAS_CREATE -#define FUSE_HAS_CREATE 1 -#endif -#else /* FUSE_KERNELABI_GEQ(7, 3) */ -#ifndef FUSE_HAS_ACCESS -#define FUSE_HAS_ACCESS 0 -#endif -#ifndef FUSE_HAS_CREATE -#define FUSE_HAS_CREATE 0 -#endif -#endif - -#if FUSE_KERNELABI_GEQ(7, 7) -#ifndef FUSE_HAS_GETLK -#define FUSE_HAS_GETLK 1 -#endif -#ifndef FUSE_HAS_SETLK -#define FUSE_HAS_SETLK 1 -#endif -#ifndef FUSE_HAS_SETLKW -#define FUSE_HAS_SETLKW 1 -#endif -#ifndef FUSE_HAS_INTERRUPT -#define FUSE_HAS_INTERRUPT 1 -#endif -#else /* FUSE_KERNELABI_GEQ(7, 7) */ -#ifndef FUSE_HAS_GETLK -#define FUSE_HAS_GETLK 0 -#endif -#ifndef FUSE_HAS_SETLK -#define FUSE_HAS_SETLK 0 -#endif -#ifndef FUSE_HAS_SETLKW -#define FUSE_HAS_SETLKW 0 -#endif -#ifndef FUSE_HAS_INTERRUPT -#define FUSE_HAS_INTERRUPT 0 -#endif -#endif - -#if FUSE_KERNELABI_GEQ(7, 8) -#ifndef FUSE_HAS_FLUSH_RELEASE -#define FUSE_HAS_FLUSH_RELEASE 1 -/* - * "DESTROY" came in the middle of the 7.8 era, - * so this is not completely exact... 
- */ -#ifndef FUSE_HAS_DESTROY -#define FUSE_HAS_DESTROY 1 -#endif -#endif -#else /* FUSE_KERNELABI_GEQ(7, 8) */ -#ifndef FUSE_HAS_FLUSH_RELEASE -#define FUSE_HAS_FLUSH_RELEASE 0 -#ifndef FUSE_HAS_DESTROY -#define FUSE_HAS_DESTROY 0 -#endif -#endif -#endif - /* misc */ SYSCTL_DECL(_vfs_fusefs); +SYSCTL_DECL(_vfs_fusefs_stats); /* Fuse locking */ Index: sys/fs/fuse/fuse_device.c =================================================================== --- sys/fs/fuse/fuse_device.c +++ sys/fs/fuse/fuse_device.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -81,27 +86,28 @@ #include #include "fuse.h" +#include "fuse_internal.h" #include "fuse_ipc.h" -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , device, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*"); static struct cdev *fuse_dev; +static d_kqfilter_t fuse_device_filter; static d_open_t fuse_device_open; -static d_close_t fuse_device_close; static d_poll_t fuse_device_poll; static d_read_t fuse_device_read; static d_write_t fuse_device_write; static struct cdevsw fuse_device_cdevsw = { + .d_kqfilter = fuse_device_filter, .d_open = fuse_device_open, - .d_close = fuse_device_close, .d_name = "fuse", .d_poll = fuse_device_poll, .d_read = fuse_device_read, @@ -109,6 +115,15 @@ .d_version = D_VERSION, }; +static int fuse_device_filt_read(struct knote *kn, long hint); +static void fuse_device_filt_detach(struct knote *kn); + +struct filterops fuse_device_rfiltops = { + .f_isfd = 1, + .f_detach = fuse_device_filt_detach, + .f_event = fuse_device_filt_read, +}; + /**************************** * * >>> Fuse device op defs @@ -119,11 +134,100 @@ fdata_dtor(void *arg) { struct fuse_data *fdata; + struct fuse_ticket *tick; fdata = arg; + if (fdata == NULL) + return; + + fdata_set_dead(fdata); + + FUSE_LOCK(); + fuse_lck_mtx_lock(fdata->aw_mtx); + /* wake up poll()ers */ + selwakeuppri(&fdata->ks_rsel, PZERO + 1); + /* Don't let syscall handlers wait in vain */ + while ((tick = fuse_aw_pop(fdata))) { + fuse_lck_mtx_lock(tick->tk_aw_mtx); + fticket_set_answered(tick); + tick->tk_aw_errno = ENOTCONN; + wakeup(tick); + fuse_lck_mtx_unlock(tick->tk_aw_mtx); + FUSE_ASSERT_AW_DONE(tick); + fuse_ticket_drop(tick); + } + fuse_lck_mtx_unlock(fdata->aw_mtx); + + /* Clean up unsent operations */ + fuse_lck_mtx_lock(fdata->ms_mtx); + while ((tick = fuse_ms_pop(fdata))) { + fuse_ticket_drop(tick); + } + fuse_lck_mtx_unlock(fdata->ms_mtx); + FUSE_UNLOCK(); + fdata_trydestroy(fdata); } +static int +fuse_device_filter(struct cdev *dev, struct knote *kn) +{ + struct fuse_data *data; + int error; + + error = 
devfs_get_cdevpriv((void **)&data); + + /* EVFILT_WRITE is not supported; the device is always ready to write */ + if (error == 0 && kn->kn_filter == EVFILT_READ) { + kn->kn_fop = &fuse_device_rfiltops; + kn->kn_hook = data; + knlist_add(&data->ks_rsel.si_note, kn, 0); + error = 0; + } else if (error == 0) { + error = EINVAL; + kn->kn_data = error; + } + + return (error); +} + +static void +fuse_device_filt_detach(struct knote *kn) +{ + struct fuse_data *data; + + data = (struct fuse_data*)kn->kn_hook; + MPASS(data != NULL); + knlist_remove(&data->ks_rsel.si_note, kn, 0); + kn->kn_hook = NULL; +} + +static int +fuse_device_filt_read(struct knote *kn, long hint) +{ + struct fuse_data *data; + int ready; + + data = (struct fuse_data*)kn->kn_hook; + MPASS(data != NULL); + + mtx_assert(&data->ms_mtx, MA_OWNED); + if (fdata_get_dead(data)) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = ENODEV; + kn->kn_data = 1; + ready = 1; + } else if (STAILQ_FIRST(&data->ms_head)) { + MPASS(data->ms_count >= 1); + kn->kn_data = data->ms_count; + ready = 1; + } else { + ready = 0; + } + + return (ready); +} + /* * Resources are set up on a per-open basis */ @@ -133,52 +237,17 @@ struct fuse_data *fdata; int error; - SDT_PROBE2(fuse, , device, trace, 1, "device open"); + SDT_PROBE2(fusefs, , device, trace, 1, "device open"); fdata = fdata_alloc(dev, td->td_ucred); error = devfs_set_cdevpriv(fdata, fdata_dtor); if (error != 0) fdata_trydestroy(fdata); else - SDT_PROBE2(fuse, , device, trace, 1, "device open success"); + SDT_PROBE2(fusefs, , device, trace, 1, "device open success"); return (error); } -static int -fuse_device_close(struct cdev *dev, int fflag, int devtype, struct thread *td) -{ - struct fuse_data *data; - struct fuse_ticket *tick; - int error; - - error = devfs_get_cdevpriv((void **)&data); - if (error != 0) - return (error); - if (!data) - panic("no fuse data upon fuse device close"); - fdata_set_dead(data); - - FUSE_LOCK(); - fuse_lck_mtx_lock(data->aw_mtx); - /* wakup 
poll()ers */ - selwakeuppri(&data->ks_rsel, PZERO + 1); - /* Don't let syscall handlers wait in vain */ - while ((tick = fuse_aw_pop(data))) { - fuse_lck_mtx_lock(tick->tk_aw_mtx); - fticket_set_answered(tick); - tick->tk_aw_errno = ENOTCONN; - wakeup(tick); - fuse_lck_mtx_unlock(tick->tk_aw_mtx); - FUSE_ASSERT_AW_DONE(tick); - fuse_ticket_drop(tick); - } - fuse_lck_mtx_unlock(data->aw_mtx); - FUSE_UNLOCK(); - - SDT_PROBE2(fuse, , device, trace, 1, "device close"); - return (0); -} - int fuse_device_poll(struct cdev *dev, int events, struct thread *td) { @@ -219,7 +288,7 @@ int buflen[3]; int i; - SDT_PROBE2(fuse, , device, trace, 1, "fuse device read"); + SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read"); err = devfs_get_cdevpriv((void **)&data); if (err != 0) @@ -228,7 +297,7 @@ fuse_lck_mtx_lock(data->ms_mtx); again: if (fdata_get_dead(data)) { - SDT_PROBE2(fuse, , device, trace, 2, + SDT_PROBE2(fusefs, , device, trace, 2, "we know early on that reader should be kicked so we " "don't wait for news"); fuse_lck_mtx_unlock(data->ms_mtx); @@ -256,7 +325,7 @@ * -- and some other cases, too, tho not totally clear, when * (cv_signal/wakeup_one signals the whole process ?) 
*/ - SDT_PROBE2(fuse, , device, trace, 1, "no message on thread"); + SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread"); goto again; } fuse_lck_mtx_unlock(data->ms_mtx); @@ -266,9 +335,10 @@ * somebody somewhere -- eg., umount routine -- * wants this liaison finished off */ - SDT_PROBE2(fuse, , device, trace, 2, "reader is to be sacked"); + SDT_PROBE2(fusefs, , device, trace, 2, + "reader is to be sacked"); if (tick) { - SDT_PROBE2(fuse, , device, trace, 2, "weird -- " + SDT_PROBE2(fusefs, , device, trace, 2, "weird -- " "\"kick\" is set tho there is message"); FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); @@ -276,7 +346,7 @@ return (ENODEV); /* This should make the daemon get off * of us */ } - SDT_PROBE2(fuse, , device, trace, 1, + SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read message successfully"); KASSERT(tick->tk_ms_bufdata || tick->tk_ms_bufsize == 0, @@ -311,7 +381,7 @@ */ if (uio->uio_resid < buflen[i]) { fdata_set_dead(data); - SDT_PROBE2(fuse, , device, trace, 2, + SDT_PROBE2(fusefs, , device, trace, 2, "daemon is stupid, kick it off..."); err = ENODEV; break; @@ -331,23 +401,26 @@ fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) { if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) { - SDT_PROBE2(fuse, , device, trace, 1, "Format error: body size " + SDT_PROBE2(fusefs, , device, trace, 1, + "Format error: body size " "differs from size claimed by header"); return (EINVAL); } - if (uio->uio_resid && ohead->error) { - SDT_PROBE2(fuse, , device, trace, 1, + if (uio->uio_resid && ohead->unique != 0 && ohead->error) { + SDT_PROBE2(fusefs, , device, trace, 1, "Format error: non zero error but message had a body"); return (EINVAL); } - /* Sanitize the linuxism of negative errnos */ - ohead->error = -(ohead->error); return (0); } -SDT_PROBE_DEFINE1(fuse, , device, fuse_device_write_bumped_into_callback, - "uint64_t"); +SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify, + "struct 
fuse_out_header*"); +SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket, + "uint64_t"); +SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found, + "struct fuse_ticket*"); /* * fuse_device_write first reads the header sent by the daemon. * If that's OK, looks up ticket/callback node by the unique id seen in header. @@ -360,15 +433,17 @@ struct fuse_out_header ohead; int err = 0; struct fuse_data *data; - struct fuse_ticket *tick, *x_tick; + struct mount *mp; + struct fuse_ticket *tick, *itick, *x_tick; int found = 0; err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); + mp = data->mp; if (uio->uio_resid < sizeof(struct fuse_out_header)) { - SDT_PROBE2(fuse, , device, trace, 1, + SDT_PROBE2(fusefs, , device, trace, 1, "fuse_device_write got less than a header!"); fdata_set_dead(data); return (EINVAL); @@ -393,15 +468,29 @@ fuse_lck_mtx_lock(data->aw_mtx); TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link, x_tick) { - SDT_PROBE1(fuse, , device, - fuse_device_write_bumped_into_callback, - tick->tk_unique); if (tick->tk_unique == ohead.unique) { + SDT_PROBE1(fusefs, , device, fuse_device_write_found, + tick); found = 1; fuse_aw_remove(tick); break; } } + if (found && tick->irq_unique > 0) { + /* + * Discard the FUSE_INTERRUPT ticket that tried to interrupt + * this operation + */ + TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link, + x_tick) { + if (itick->tk_unique == tick->irq_unique) { + fuse_aw_remove(itick); + fuse_ticket_drop(itick); + break; + } + } + tick->irq_unique = 0; + } fuse_lck_mtx_unlock(data->aw_mtx); if (found) { @@ -414,13 +503,15 @@ * via ticket_drop(), so no manual mucking * around...) 
*/ - SDT_PROBE2(fuse, , device, trace, 1, + SDT_PROBE2(fusefs, , device, trace, 1, "pass ticket to a callback"); + /* Sanitize the linuxism of negative errnos */ + ohead.error *= -1; memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); err = tick->tk_aw_handler(tick, uio); } else { /* pretender doesn't wanna do anything with answer */ - SDT_PROBE2(fuse, , device, trace, 1, + SDT_PROBE2(fusefs, , device, trace, 1, "stuff devalidated, so we drop it"); } @@ -430,11 +521,51 @@ * because fuse_ticket_drop() will deal with refcount anyway. */ fuse_ticket_drop(tick); + } else if (ohead.unique == 0){ + /* unique == 0 means asynchronous notification */ + SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead); + switch (ohead.error) { + case FUSE_NOTIFY_INVAL_ENTRY: + err = fuse_internal_invalidate_entry(mp, uio); + break; + case FUSE_NOTIFY_INVAL_INODE: + err = fuse_internal_invalidate_inode(mp, uio); + break; + case FUSE_NOTIFY_RETRIEVE: + case FUSE_NOTIFY_STORE: + /* + * Unimplemented. I don't know of any file systems + * that use them, and the protocol isn't sound anyway, + * since the notification messages don't include the + * inode's generation number. Without that, it's + * possible to manipulate the cache of the wrong vnode. + * Finally, it's not defined what this message should + * do for a file with dirty cache. + */ + case FUSE_NOTIFY_POLL: + /* Unimplemented. See comments in fuse_vnops */ + default: + /* Not implemented */ + err = ENOSYS; + } } else { /* no callback at all! */ - SDT_PROBE2(fuse, , device, trace, 1, - "erhm, no handler for this response"); - err = EINVAL; + SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket, + ohead.unique); + if (ohead.error == -EAGAIN) { + /* + * This was probably a response to a FUSE_INTERRUPT + * operation whose original operation is already + * complete. We can't store FUSE_INTERRUPT tickets + * indefinitely because their responses are optional. + * So we delete them when the original operation + * completes. 
And sadly the fuse_out_header doesn't + * identify the opcode, so we have to guess. + */ + err = 0; + } else { + err = EINVAL; + } } return (err); @@ -445,7 +576,7 @@ { fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR, - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, "fuse"); + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse"); if (fuse_dev == NULL) return (ENOMEM); return (0); Index: sys/fs/fuse/fuse_file.h =================================================================== --- sys/fs/fuse/fuse_file.h +++ sys/fs/fuse/fuse_file.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -66,52 +71,115 @@ #include #include +/* + * The fufh type is the access mode of the fuse file handle. It's the portion + * of the open(2) flags related to permission. + */ typedef enum fufh_type { FUFH_INVALID = -1, - FUFH_RDONLY = 0, - FUFH_WRONLY = 1, - FUFH_RDWR = 2, - FUFH_MAXTYPE = 3, + FUFH_RDONLY = O_RDONLY, + FUFH_WRONLY = O_WRONLY, + FUFH_RDWR = O_RDWR, + FUFH_EXEC = O_EXEC, } fufh_type_t; -_Static_assert(FUFH_RDONLY == O_RDONLY, "RDONLY"); -_Static_assert(FUFH_WRONLY == O_WRONLY, "WRONLY"); -_Static_assert(FUFH_RDWR == O_RDWR, "RDWR"); +/* + * FUSE File Handles + * + * The FUSE protocol says that a server may assign a unique 64-bit file handle + * every time that a file is opened. Effectively, that's once for each file + * descriptor. + * + * Unfortunately, the VFS doesn't help us here. VOPs don't have a + * struct file* argument. fileops do, but many syscalls bypass the fileops + * layer and go straight to a vnode. Some, like writing from cache, can't + * track a file handle even in theory. 
The entire concept of the file handle + * is a product of FUSE's Linux origins; Linux lacks vnodes and almost every + * file system operation takes a struct file* argument. + * + * Since FreeBSD's VFS is more file descriptor-agnostic, we must store FUSE + * filehandles in the vnode. One option would be to only store a single file + * handle and never open FUSE files concurrently. That's what NetBSD does. + * But that violates FUSE's security model. FUSE expects the server to do all + * authorization (except when mounted with -o default_permissions). In order + * to do that, the server needs us to send FUSE_OPEN every time somebody opens + * a new file descriptor. + * + * Another option would be to never open FUSE files concurrently, but send a + * FUSE_ACCESS prior to every open after the first. That would give the server + * the opportunity to authorize the access. Unfortunately, the FUSE protocol + * makes ACCESS optional. File systems that don't implement it are assumed to + * authorize everything. A survey of 32 fuse file systems showed that only 14 + * implemented access. Among the laggards were a few that really ought to be + * doing server-side authorization. + * + * So we do something hacky, similar to what OpenBSD, Illumos, and OSXFuse do. + * We store a list of file handles, one for each combination of vnode, uid, + * gid, pid, and access mode. When opening a file, we first check whether + * there's already a matching file handle. If so, we reuse it. If not, we + * send FUSE_OPEN and create a new file handle. That minimizes the number of + * open file handles while still allowing the server to authorize stuff. + * + * VOPs that need a file handle search through the list for a close match. + * They aren't guaranteed to find an exact match because, for example, a + * process may have changed its UID since opening the file. Also, most VOPs + * don't know exactly what permission they need. Is O_RDWR required or is + * O_RDONLY good enough? 
So the file handle we end up using may not be exactly + * the one we're supposed to use with that file descriptor. But if the FUSE + * file system isn't too picky, it will work. (FWIW even Linux sometimes + * guesses the file handle, during writes from cache or most SETATTR + * operations). + * + * I suspect this mess is part of the reason why neither NFS nor 9P have an + * equivalent of FUSE file handles. + */ struct fuse_filehandle { + LIST_ENTRY(fuse_filehandle) next; + + /* The filehandle returned by FUSE_OPEN */ uint64_t fh_id; - fufh_type_t fh_type; -}; -#define FUFH_IS_VALID(f) ((f)->fh_type != FUFH_INVALID) + /* + * flags returned by FUSE_OPEN + * Supported flags: FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE + * Unsupported: + * FOPEN_NONSEEKABLE: Adding support would require a new per-file + * or per-vnode attribute, which would have to be checked by + * kern_lseek (and others) for every file system. The benefit is + * dubious, since I'm unaware of any file systems in ports that use + * this flag. 
+ */ + uint32_t fuse_open_flags; -static inline fufh_type_t -fuse_filehandle_xlate_from_mmap(int fflags) -{ - if (fflags & (PROT_READ | PROT_WRITE)) - return FUFH_RDWR; - else if (fflags & (PROT_WRITE)) - return FUFH_WRONLY; - else if ((fflags & PROT_READ) || (fflags & PROT_EXEC)) - return FUFH_RDONLY; - else - return FUFH_INVALID; -} + /* The access mode of the file handle */ + fufh_type_t fufh_type; -static inline fufh_type_t -fuse_filehandle_xlate_from_fflags(int fflags) -{ - if ((fflags & FREAD) && (fflags & FWRITE)) - return FUFH_RDWR; - else if (fflags & (FWRITE)) - return FUFH_WRONLY; - else if (fflags & (FREAD)) - return FUFH_RDONLY; - else - panic("FUSE: What kind of a flag is this (%x)?", fflags); -} + /* Credentials used to open the file */ + gid_t gid; + pid_t pid; + uid_t uid; +}; +#define FUFH_IS_VALID(f) ((f)->fufh_type != FUFH_INVALID) + +/* + * Get the flags to use for FUSE_CREATE, FUSE_OPEN and FUSE_RELEASE + * + * These are supposed to be the same as the flags argument to open(2). + * However, since we can't reliably associate a fuse_filehandle with a specific + * file descriptor it would be dangerous to include anything more than + * the access mode flags. For example, suppose we open a file twice, once with + * O_APPEND and once without. Then the user pwrite(2)s to offset 0 using the + * second file descriptor. If fusefs uses the first file handle, then the + * server may append the write to the end of the file rather than at offset 0. + * To prevent problems like this, we only ever send the portion of flags + * related to access mode. + * + * It's essential to send that portion, because FUSE uses it for server-side + * authorization.
+ */ static inline int -fuse_filehandle_xlate_to_oflags(fufh_type_t type) +fufh_type_2_fflags(fufh_type_t type) { int oflags = -1; @@ -119,6 +187,7 @@ case FUFH_RDONLY: case FUFH_WRONLY: case FUFH_RDWR: + case FUFH_EXEC: oflags = type; break; default: @@ -128,19 +197,28 @@ return oflags; } -int fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type); -fufh_type_t fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type); -int fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp); -int fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp); +bool fuse_filehandle_validrw(struct vnode *vp, int mode, + struct ucred *cred, pid_t pid); +int fuse_filehandle_get(struct vnode *vp, int fflag, + struct fuse_filehandle **fufhp, struct ucred *cred, + pid_t pid); +int fuse_filehandle_get_anyflags(struct vnode *vp, + struct fuse_filehandle **fufhp, struct ucred *cred, + pid_t pid); +int fuse_filehandle_getrw(struct vnode *vp, int fflag, + struct fuse_filehandle **fufhp, struct ucred *cred, + pid_t pid); void fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp, uint64_t fh_id); -int fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type, + struct fuse_filehandle **fufhp, struct thread *td, + struct ucred *cred, struct fuse_open_out *foo); +int fuse_filehandle_open(struct vnode *vp, int mode, struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred); -int fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type, +int fuse_filehandle_close(struct vnode *vp, struct fuse_filehandle *fufh, struct thread *td, struct ucred *cred); + +void fuse_file_init(void); +void fuse_file_destroy(void); #endif /* _FUSE_FILE_H_ */ Index: sys/fs/fuse/fuse_file.c =================================================================== --- sys/fs/fuse/fuse_file.c +++ sys/fs/fuse/fuse_file.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. 
* All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -59,8 +64,9 @@ __FBSDID("$FreeBSD$"); #include -#include #include +#include +#include #include #include #include @@ -79,52 +85,61 @@ #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" +#include "fuse_io.h" #include "fuse_ipc.h" #include "fuse_node.h" -SDT_PROVIDER_DECLARE(fuse); +MALLOC_DEFINE(M_FUSE_FILEHANDLE, "fuse_filefilehandle", "FUSE file handle"); + +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , file, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , file, trace, "int", "char*"); -static int fuse_fh_count = 0; +static counter_u64_t fuse_fh_count; -SYSCTL_INT(_vfs_fusefs, OID_AUTO, filehandle_count, CTLFLAG_RD, - &fuse_fh_count, 0, "number of open FUSE filehandles"); +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, filehandle_count, CTLFLAG_RD, + &fuse_fh_count, "number of open FUSE filehandles"); +/* Get the FUFH type for a particular access mode */ +static inline fufh_type_t +fflags_2_fufh_type(int fflags) +{ + if ((fflags & FREAD) && (fflags & FWRITE)) + return FUFH_RDWR; + else if (fflags & (FWRITE)) + return FUFH_WRONLY; + else if (fflags & (FREAD)) + return FUFH_RDONLY; + else if (fflags & (FEXEC)) + return FUFH_EXEC; + else + panic("FUSE: What kind of a flag is this (%x)?", fflags); +} + int -fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type, +fuse_filehandle_open(struct vnode *vp, int a_mode, struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred) { struct fuse_dispatcher fdi; struct fuse_open_in *foi; struct fuse_open_out *foo; + 
fufh_type_t fufh_type; int err = 0; int oflags = 0; int op = FUSE_OPEN; - if (fuse_filehandle_valid(vp, fufh_type)) { - panic("FUSE: filehandle_open called despite valid fufh (type=%d)", - fufh_type); - /* NOTREACHED */ - } - /* - * Note that this means we are effectively FILTERING OUT open() flags. - */ - oflags = fuse_filehandle_xlate_to_oflags(fufh_type); + fufh_type = fflags_2_fufh_type(a_mode); + oflags = fufh_type_2_fflags(fufh_type); if (vnode_isdir(vp)) { op = FUSE_OPENDIR; - if (fufh_type != FUFH_RDONLY) { - SDT_PROBE2(fuse, , file, trace, 1, - "non-rdonly fh requested for a directory?"); - printf("FUSE:non-rdonly fh requested for a directory?\n"); - fufh_type = FUFH_RDONLY; - } + /* vn_open_vnode already rejects FWRITE on directories */ + MPASS(fufh_type == FUFH_RDONLY || fufh_type == FUFH_EXEC); } fdisp_init(&fdi, sizeof(*foi)); fdisp_make_vp(&fdi, op, vp, td, cred); @@ -133,7 +148,7 @@ foi->flags = oflags; if ((err = fdisp_wait_answ(&fdi))) { - SDT_PROBE2(fuse, , file, trace, 1, + SDT_PROBE2(fusefs, , file, trace, 1, "OUCH ... daemon didn't give fh"); if (err == ENOENT) { fuse_internal_vnode_disappear(vp); @@ -142,42 +157,24 @@ } foo = fdi.answ; - fuse_filehandle_init(vp, fufh_type, fufhp, foo->fh); + fuse_filehandle_init(vp, fufh_type, fufhp, td, cred, foo); + fuse_vnode_open(vp, foo->open_flags, td); - /* - * For WRONLY opens, force DIRECT_IO. This is necessary - * since writing a partial block through the buffer cache - * will result in a read of the block and that read won't - * be allowed by the WRONLY open. 
- */ - if (fufh_type == FUFH_WRONLY) - fuse_vnode_open(vp, foo->open_flags | FOPEN_DIRECT_IO, td); - else - fuse_vnode_open(vp, foo->open_flags, td); - out: fdisp_destroy(&fdi); return err; } int -fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type, +fuse_filehandle_close(struct vnode *vp, struct fuse_filehandle *fufh, struct thread *td, struct ucred *cred) { struct fuse_dispatcher fdi; struct fuse_release_in *fri; - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_filehandle *fufh = NULL; int err = 0; int op = FUSE_RELEASE; - fufh = &(fvdat->fufh[fufh_type]); - if (!FUFH_IS_VALID(fufh)) { - panic("FUSE: filehandle_put called on invalid fufh (type=%d)", - fufh_type); - /* NOTREACHED */ - } if (fuse_isdeadfs(vp)) { goto out; } @@ -187,96 +184,193 @@ fdisp_make_vp(&fdi, op, vp, td, cred); fri = fdi.indata; fri->fh = fufh->fh_id; - fri->flags = fuse_filehandle_xlate_to_oflags(fufh_type); + fri->flags = fufh_type_2_fflags(fufh->fufh_type); + /* + * If the file has a POSIX lock then we're supposed to set lock_owner. + * If not, then lock_owner is undefined. So we may as well always set + * it. + */ + fri->lock_owner = td->td_proc->p_pid; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); out: - atomic_subtract_acq_int(&fuse_fh_count, 1); - fufh->fh_id = (uint64_t)-1; - fufh->fh_type = FUFH_INVALID; + counter_u64_add(fuse_fh_count, -1); + LIST_REMOVE(fufh, next); + free(fufh, M_FUSE_FILEHANDLE); return err; } -int -fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type) -{ - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_filehandle *fufh; - - fufh = &(fvdat->fufh[fufh_type]); - return FUFH_IS_VALID(fufh); -} - /* * Check for a valid file handle, first the type requested, but if that * isn't valid, try for FUFH_RDWR. - * Return the FUFH type that is valid or FUFH_INVALID if there are none. - * This is a variant of fuse_filehandle_vaild() analogous to - * fuse_filehandle_getrw(). 
+ * Return true if there is any file handle with the correct credentials and + * a fufh type that includes the provided one. + * A pid of 0 means "don't care" */ -fufh_type_t -fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type) +bool +fuse_filehandle_validrw(struct vnode *vp, int mode, + struct ucred *cred, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; + fufh_type_t fufh_type = fflags_2_fufh_type(mode); - fufh = &fvdat->fufh[fufh_type]; - if (FUFH_IS_VALID(fufh) != 0) - return (fufh_type); - fufh = &fvdat->fufh[FUFH_RDWR]; - if (FUFH_IS_VALID(fufh) != 0) - return (FUFH_RDWR); - return (FUFH_INVALID); + /* + * Unlike fuse_filehandle_get, we want to search for a filehandle with + * the exact cred, and no fallback + */ + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->fufh_type == fufh_type && + fufh->uid == cred->cr_uid && + fufh->gid == cred->cr_rgid && + (pid == 0 || fufh->pid == pid)) + return true; + } + + if (fufh_type == FUFH_EXEC) + return false; + + /* Fallback: find a RDWR list entry with the right cred */ + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->fufh_type == FUFH_RDWR && + fufh->uid == cred->cr_uid && + fufh->gid == cred->cr_rgid && + (pid == 0 || fufh->pid == pid)) + return true; + } + + return false; } int -fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp) +fuse_filehandle_get(struct vnode *vp, int fflag, + struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; + fufh_type_t fufh_type; - fufh = &(fvdat->fufh[fufh_type]); - if (!FUFH_IS_VALID(fufh)) + fufh_type = fflags_2_fufh_type(fflag); + /* cred can be NULL for in-kernel clients */ + if (cred == NULL) + goto fallback; + + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->fufh_type == fufh_type && + fufh->uid == cred->cr_uid && + fufh->gid == cred->cr_rgid && + (pid == 0 || fufh->pid == pid)) 
+ goto found; + } + +fallback: + /* Fallback: find a list entry with the right flags */ + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->fufh_type == fufh_type) + break; + } + + if (fufh == NULL) return EBADF; + +found: if (fufhp != NULL) *fufhp = fufh; return 0; } +/* Get a file handle with any kind of flags */ int -fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp) +fuse_filehandle_get_anyflags(struct vnode *vp, + struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; - fufh = &(fvdat->fufh[fufh_type]); - if (!FUFH_IS_VALID(fufh)) { - fufh_type = FUFH_RDWR; + if (cred == NULL) + goto fallback; + + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (fufh->uid == cred->cr_uid && + fufh->gid == cred->cr_rgid && + (pid == 0 || fufh->pid == pid)) + goto found; } - return fuse_filehandle_get(vp, fufh_type, fufhp); + +fallback: + /* Fallback: find any list entry */ + fufh = LIST_FIRST(&fvdat->handles); + + if (fufh == NULL) + return EBADF; + +found: + if (fufhp != NULL) + *fufhp = fufh; + return 0; } +int +fuse_filehandle_getrw(struct vnode *vp, int fflag, + struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) +{ + int err; + + err = fuse_filehandle_get(vp, fflag, fufhp, cred, pid); + if (err) + err = fuse_filehandle_get(vp, FREAD | FWRITE, fufhp, cred, pid); + return err; +} + void fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type, - struct fuse_filehandle **fufhp, uint64_t fh_id) + struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred, + struct fuse_open_out *foo) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; - fufh = &(fvdat->fufh[fufh_type]); - MPASS(!FUFH_IS_VALID(fufh)); - fufh->fh_id = fh_id; - fufh->fh_type = fufh_type; + fufh = malloc(sizeof(struct fuse_filehandle), M_FUSE_FILEHANDLE, + M_WAITOK); + MPASS(fufh != NULL); + fufh->fh_id = foo->fh; + 
fufh->fufh_type = fufh_type; + fufh->gid = cred->cr_rgid; + fufh->uid = cred->cr_uid; + fufh->pid = td->td_proc->p_pid; + fufh->fuse_open_flags = foo->open_flags; if (!FUFH_IS_VALID(fufh)) { panic("FUSE: init: invalid filehandle id (type=%d)", fufh_type); } + LIST_INSERT_HEAD(&fvdat->handles, fufh, next); if (fufhp != NULL) *fufhp = fufh; - atomic_add_acq_int(&fuse_fh_count, 1); + counter_u64_add(fuse_fh_count, 1); + + if (foo->open_flags & FOPEN_DIRECT_IO) { + ASSERT_VOP_ELOCKED(vp, __func__); + VTOFUD(vp)->flag |= FN_DIRECTIO; + fuse_io_invalbuf(vp, td); + } else { + if ((foo->open_flags & FOPEN_KEEP_CACHE) == 0) + fuse_io_invalbuf(vp, td); + VTOFUD(vp)->flag &= ~FN_DIRECTIO; + } + +} + +void +fuse_file_init(void) +{ + fuse_fh_count = counter_u64_alloc(M_WAITOK); +} + +void +fuse_file_destroy(void) +{ + counter_u64_free(fuse_fh_count); } Index: sys/fs/fuse/fuse_internal.h =================================================================== --- sys/fs/fuse/fuse_internal.h +++ sys/fs/fuse/fuse_internal.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -61,6 +66,8 @@ #define _FUSE_INTERNAL_H_ #include +#include +#include #include #include #include @@ -68,6 +75,9 @@ #include "fuse_ipc.h" #include "fuse_node.h" +extern counter_u64_t fuse_lookup_cache_hits; +extern counter_u64_t fuse_lookup_cache_misses; + static inline bool vfs_isrdonly(struct mount *mp) { @@ -80,12 +90,6 @@ return (vp->v_mount); } -static inline bool -vnode_mountedhere(struct vnode *vp) -{ - return (vp->v_mountedhere != NULL); -} - static inline enum vtype vnode_vtype(struct vnode *vp) { @@ -134,12 +138,6 @@ uio->uio_offset = offset; } -static inline void -uio_setresid(struct uio *uio, ssize_t resid) -{ - uio->uio_resid = resid; -} - /* miscellaneous */ static inline bool @@ -156,25 +154,57 @@ return (vp->v_mount->mnt_stat.f_iosize); } -/* access */ +/* + * Make a cacheable timeout in bintime format value based on a fuse_attr_out + * response + */ +static inline void +fuse_validity_2_bintime(uint64_t attr_valid, uint32_t attr_valid_nsec, + struct bintime *timeout) +{ + struct timespec now, duration, timeout_ts; -#define FVP_ACCESS_NOOP 0x01 + getnanouptime(&now); + /* "+ 2" is the bound of attr_valid_nsec + now.tv_nsec */ + /* Why oh why isn't there a TIME_MAX defined? 
*/ + if (attr_valid >= INT_MAX || attr_valid + now.tv_sec + 2 >= INT_MAX) { + timeout->sec = INT_MAX; + } else { + duration.tv_sec = attr_valid; + duration.tv_nsec = attr_valid_nsec; + timespecadd(&duration, &now, &timeout_ts); + timespec2bintime(&timeout_ts, timeout); + } +} -#define FACCESS_VA_VALID 0x01 -#define FACCESS_DO_ACCESS 0x02 -#define FACCESS_STICKY 0x04 -#define FACCESS_CHOWN 0x08 -#define FACCESS_NOCHECKSPY 0x10 -#define FACCESS_SETGID 0x12 +/* + * Make a cacheable timeout value in timespec format based on the fuse_entry_out + * response + */ +static inline void +fuse_validity_2_timespec(const struct fuse_entry_out *feo, + struct timespec *timeout) +{ + struct timespec duration, now; -#define FACCESS_XQUERIES (FACCESS_STICKY | FACCESS_CHOWN | FACCESS_SETGID) + getnanouptime(&now); + /* "+ 2" is the bound of entry_valid_nsec + now.tv_nsec */ + if (feo->entry_valid >= INT_MAX || + feo->entry_valid + now.tv_sec + 2 >= INT_MAX) { + timeout->tv_sec = INT_MAX; + } else { + duration.tv_sec = feo->entry_valid; + duration.tv_nsec = feo->entry_valid_nsec; + timespecadd(&duration, &now, timeout); + } +} -struct fuse_access_param { - uid_t xuid; - gid_t xgid; - uint32_t facc_flags; -}; +/* VFS ops */ +int +fuse_internal_get_cached_vnode(struct mount*, ino_t, int, struct vnode**); + +/* access */ static inline int fuse_match_cred(struct ucred *basecred, struct ucred *usercred) { @@ -189,8 +219,8 @@ return (EPERM); } -int fuse_internal_access(struct vnode *vp, mode_t mode, - struct fuse_access_param *facp, struct thread *td, struct ucred *cred); +int fuse_internal_access(struct vnode *vp, accmode_t mode, + struct thread *td, struct ucred *cred); /* attributes */ void fuse_internal_cache_attrs(struct vnode *vp, struct fuse_attr *attr, @@ -198,21 +228,35 @@ /* fsync */ -int fuse_internal_fsync(struct vnode *vp, struct thread *td, - struct ucred *cred, struct fuse_filehandle *fufh); +int fuse_internal_fsync(struct vnode *vp, struct thread *td, int waitfor, + bool 
datasync); int fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio); -/* readdir */ +/* getattr */ +int fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap, + struct ucred *cred, struct thread *td); +int fuse_internal_getattr(struct vnode *vp, struct vattr *vap, + struct ucred *cred, struct thread *td); +/* asynchronous invalidation */ +int fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio); +int fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio); + +/* mknod */ +int fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, struct vattr *vap); + +/* readdir */ struct pseudo_dirent { uint32_t d_namlen; }; +int fuse_internal_readdir(struct vnode *vp, struct uio *uio, off_t startoff, + struct fuse_filehandle *fufh, struct fuse_iov *cookediov, int *ncookies, + u_long *cookies); +int fuse_internal_readdir_processdata(struct uio *uio, off_t startoff, + int *fnd_start, size_t reqsize, void *buf, size_t bufsize, + struct fuse_iov *cookediov, int *ncookies, u_long **cookiesp); -int fuse_internal_readdir(struct vnode *vp, struct uio *uio, - struct fuse_filehandle *fufh, struct fuse_iov *cookediov); -int fuse_internal_readdir_processdata(struct uio *uio, size_t reqsize, - void *buf, size_t bufsize, void *param); - /* remove */ int fuse_internal_remove(struct vnode *dvp, struct vnode *vp, @@ -227,6 +271,10 @@ void fuse_internal_vnode_disappear(struct vnode *vp); +/* setattr */ +int fuse_internal_setattr(struct vnode *vp, struct vattr *va, + struct thread *td, struct ucred *cred); + /* strategy */ /* entity creation */ @@ -270,5 +318,9 @@ int fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio); void fuse_internal_send_init(struct fuse_data *data, struct thread *td); + +/* module load/unload */ +void fuse_internal_init(void); +void fuse_internal_destroy(void); #endif /* _FUSE_INTERNAL_H_ */ Index: sys/fs/fuse/fuse_internal.c 
=================================================================== --- sys/fs/fuse/fuse_internal.c +++ sys/fs/fuse/fuse_internal.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -59,8 +64,9 @@ __FBSDID("$FreeBSD$"); #include -#include #include +#include +#include #include #include #include @@ -89,35 +95,78 @@ #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" +#include "fuse_io.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_file.h" -#include "fuse_param.h" -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , internal, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , internal, trace, "int", "char*"); #ifdef ZERO_PAD_INCOMPLETE_BUFS static int isbzero(void *buf, size_t len); #endif -/* access */ +counter_u64_t fuse_lookup_cache_hits; +counter_u64_t fuse_lookup_cache_misses; +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_hits, CTLFLAG_RD, + &fuse_lookup_cache_hits, "number of positive cache hits in lookup"); + +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_misses, CTLFLAG_RD, + &fuse_lookup_cache_misses, "number of cache misses in lookup"); + int +fuse_internal_get_cached_vnode(struct mount* mp, ino_t ino, int flags, + struct vnode **vpp) +{ + struct bintime now; + struct thread *td = curthread; + uint64_t nodeid = ino; + int error; + + *vpp = NULL; + + error = vfs_hash_get(mp, fuse_vnode_hash(nodeid), flags, td, vpp, + fuse_vnode_cmp, &nodeid); + if (error) + return error; + /* + * Check the entry cache 
timeout. We have to do this within fusefs + * instead of by using cache_enter_time/cache_lookup because those + * routines are only intended to work with pathnames, not inodes + */ + if (*vpp != NULL) { + getbinuptime(&now); + if (bintime_cmp(&(VTOFUD(*vpp)->entry_cache_timeout), &now, >)){ + counter_u64_add(fuse_lookup_cache_hits, 1); + return 0; + } else { + /* Entry cache timeout */ + counter_u64_add(fuse_lookup_cache_misses, 1); + cache_purge(*vpp); + vput(*vpp); + *vpp = NULL; + } + } + return 0; +} + +/* Synchronously send a FUSE_ACCESS operation */ +int fuse_internal_access(struct vnode *vp, - mode_t mode, - struct fuse_access_param *facp, + accmode_t mode, struct thread *td, struct ucred *cred) { int err = 0; - uint32_t mask = 0; + uint32_t mask = F_OK; int dataflags; int vtype; struct mount *mp; @@ -125,77 +174,57 @@ struct fuse_access_in *fai; struct fuse_data *data; - /* NOT YET DONE */ - /* - * If this vnop gives you trouble, just return 0 here for a lazy - * kludge. - */ - /* return 0;*/ - mp = vnode_mount(vp); vtype = vnode_vtype(vp); data = fuse_get_mpdata(mp); dataflags = data->dataflags; - if ((mode & VWRITE) && vfs_isrdonly(mp)) { - return EACCES; - } - /* Unless explicitly permitted, deny everyone except the fs owner. */ - if (vnode_isvroot(vp) && !(facp->facc_flags & FACCESS_NOCHECKSPY)) { - if (!(dataflags & FSESS_DAEMON_CAN_SPY)) { - int denied = fuse_match_cred(data->daemoncred, - cred); + if (mode == 0) + return 0; - if (denied) { - return EPERM; - } + if (mode & VMODIFY_PERMS && vfs_isrdonly(mp)) { + switch (vp->v_type) { + case VDIR: + /* FALLTHROUGH */ + case VLNK: + /* FALLTHROUGH */ + case VREG: + return EROFS; + default: + break; } - facp->facc_flags |= FACCESS_NOCHECKSPY; } - if (!(facp->facc_flags & FACCESS_DO_ACCESS)) { - return 0; + + /* Unless explicitly permitted, deny everyone except the fs owner. 
*/ + if (!(dataflags & FSESS_DAEMON_CAN_SPY)) { + if (fuse_match_cred(data->daemoncred, cred)) + return EPERM; } - if (((vtype == VREG) && (mode & VEXEC))) { -#ifdef NEED_MOUNT_ARGUMENT_FOR_THIS - /* Let the kernel handle this through open / close heuristics.*/ - return ENOTSUP; -#else - /* Let the kernel handle this. */ - return 0; -#endif - } - if (!fsess_isimpl(mp, FUSE_ACCESS)) { - /* Let the kernel handle this. */ - return 0; - } + if (dataflags & FSESS_DEFAULT_PERMISSIONS) { - /* Let the kernel handle this. */ - return 0; + struct vattr va; + + fuse_internal_getattr(vp, &va, cred, td); + return vaccess(vp->v_type, va.va_mode, va.va_uid, + va.va_gid, mode, cred, NULL); } - if ((mode & VADMIN) != 0) { - err = priv_check_cred(cred, PRIV_VFS_ADMIN); - if (err) { - return err; - } - } - if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0) { + + if (!fsess_isimpl(mp, FUSE_ACCESS)) + return 0; + + if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0) mask |= W_OK; - } - if ((mode & VREAD) != 0) { + if ((mode & VREAD) != 0) mask |= R_OK; - } - if ((mode & VEXEC) != 0) { + if ((mode & VEXEC) != 0) mask |= X_OK; - } - bzero(&fdi, sizeof(fdi)); fdisp_init(&fdi, sizeof(*fai)); fdisp_make_vp(&fdi, FUSE_ACCESS, vp, td, cred); fai = fdi.indata; - fai->mask = F_OK; - fai->mask |= mask; + fai->mask = mask; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); @@ -208,9 +237,9 @@ } /* - * Cache FUSE attributes from feo, in attr cache associated with vnode 'vp'. - * Optionally, if argument 'vap' is not NULL, store a copy of the converted - * attributes there as well. + * Cache FUSE attributes from attr, in attribute cache associated with vnode + * 'vp'. Optionally, if argument 'vap' is not NULL, store a copy of the + * converted attributes there as well. * * If the nominal attribute cache TTL is zero, do not cache on the 'vp' (but do * return the result to the caller). 
@@ -221,49 +250,57 @@ { struct mount *mp; struct fuse_vnode_data *fvdat; + struct fuse_data *data; struct vattr *vp_cache_at; mp = vnode_mount(vp); fvdat = VTOFUD(vp); + data = fuse_get_mpdata(mp); - /* Honor explicit do-not-cache requests from user filesystems. */ - if (attr_valid == 0 && attr_valid_nsec == 0) - fvdat->valid_attr_cache = false; - else - fvdat->valid_attr_cache = true; + ASSERT_VOP_ELOCKED(vp, "fuse_internal_cache_attrs"); - vp_cache_at = VTOVA(vp); + fuse_validity_2_bintime(attr_valid, attr_valid_nsec, + &fvdat->attr_cache_timeout); - if (vap == NULL && vp_cache_at == NULL) + /* Fix our buffers if the filesize changed without us knowing */ + if (vnode_isreg(vp) && attr->size != fvdat->cached_attrs.va_size) { + (void)fuse_vnode_setsize(vp, attr->size); + fvdat->cached_attrs.va_size = attr->size; + } + + if (attr_valid > 0 || attr_valid_nsec > 0) + vp_cache_at = &(fvdat->cached_attrs); + else if (vap != NULL) + vp_cache_at = vap; + else return; - if (vap == NULL) - vap = vp_cache_at; - - vattr_null(vap); - - vap->va_fsid = mp->mnt_stat.f_fsid.val[0]; - vap->va_fileid = attr->ino; - vap->va_mode = attr->mode & ~S_IFMT; - vap->va_nlink = attr->nlink; - vap->va_uid = attr->uid; - vap->va_gid = attr->gid; - vap->va_rdev = attr->rdev; - vap->va_size = attr->size; + vattr_null(vp_cache_at); + vp_cache_at->va_fsid = mp->mnt_stat.f_fsid.val[0]; + vp_cache_at->va_fileid = attr->ino; + vp_cache_at->va_mode = attr->mode & ~S_IFMT; + vp_cache_at->va_nlink = attr->nlink; + vp_cache_at->va_uid = attr->uid; + vp_cache_at->va_gid = attr->gid; + vp_cache_at->va_rdev = attr->rdev; + vp_cache_at->va_size = attr->size; /* XXX on i386, seconds are truncated to 32 bits */ - vap->va_atime.tv_sec = attr->atime; - vap->va_atime.tv_nsec = attr->atimensec; - vap->va_mtime.tv_sec = attr->mtime; - vap->va_mtime.tv_nsec = attr->mtimensec; - vap->va_ctime.tv_sec = attr->ctime; - vap->va_ctime.tv_nsec = attr->ctimensec; - vap->va_blocksize = PAGE_SIZE; - vap->va_type = 
IFTOVT(attr->mode); - vap->va_bytes = attr->blocks * S_BLKSIZE; - vap->va_flags = 0; + vp_cache_at->va_atime.tv_sec = attr->atime; + vp_cache_at->va_atime.tv_nsec = attr->atimensec; + vp_cache_at->va_mtime.tv_sec = attr->mtime; + vp_cache_at->va_mtime.tv_nsec = attr->mtimensec; + vp_cache_at->va_ctime.tv_sec = attr->ctime; + vp_cache_at->va_ctime.tv_nsec = attr->ctimensec; + if (fuse_libabi_geq(data, 7, 9) && attr->blksize > 0) + vp_cache_at->va_blocksize = attr->blksize; + else + vp_cache_at->va_blocksize = PAGE_SIZE; + vp_cache_at->va_type = IFTOVT(attr->mode); + vp_cache_at->va_bytes = attr->blocks * S_BLKSIZE; + vp_cache_at->va_flags = 0; - if (vap != vp_cache_at && vp_cache_at != NULL) - memcpy(vp_cache_at, vap, sizeof(*vap)); + if (vap != vp_cache_at && vap != NULL) + memcpy(vap, vp_cache_at, sizeof(*vap)); } @@ -281,47 +318,195 @@ int fuse_internal_fsync(struct vnode *vp, struct thread *td, - struct ucred *cred, - struct fuse_filehandle *fufh) + int waitfor, + bool datasync) { - int op = FUSE_FSYNC; - struct fuse_fsync_in *ffsi; + struct fuse_fsync_in *ffsi = NULL; struct fuse_dispatcher fdi; + struct fuse_filehandle *fufh; + struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct mount *mp = vnode_mount(vp); + int op = FUSE_FSYNC; + int err = 0; - if (vnode_isdir(vp)) { - op = FUSE_FSYNCDIR; + if (!fsess_isimpl(vnode_mount(vp), + (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) { + return 0; } - fdisp_init(&fdi, sizeof(*ffsi)); - fdisp_make_vp(&fdi, op, vp, td, cred); - ffsi = fdi.indata; - ffsi->fh = fufh->fh_id; + if (vnode_isdir(vp)) + op = FUSE_FSYNCDIR; - ffsi->fsync_flags = 1; /* datasync */ + if (!fsess_isimpl(mp, op)) + return 0; - fuse_insert_callback(fdi.tick, fuse_internal_fsync_callback); - fuse_insert_message(fdi.tick); + fdisp_init(&fdi, sizeof(*ffsi)); + /* + * fsync every open file handle for this file, because we can't be sure + * which file handle the caller is really referring to. 
+ */ + LIST_FOREACH(fufh, &fvdat->handles, next) { + if (ffsi == NULL) + fdisp_make_vp(&fdi, op, vp, td, NULL); + else + fdisp_refresh_vp(&fdi, op, vp, td, NULL); + ffsi = fdi.indata; + ffsi->fh = fufh->fh_id; + ffsi->fsync_flags = 0; + if (datasync) + ffsi->fsync_flags = 1; + + if (waitfor == MNT_WAIT) { + err = fdisp_wait_answ(&fdi); + } else { + fuse_insert_callback(fdi.tick, + fuse_internal_fsync_callback); + fuse_insert_message(fdi.tick, false); + } + if (err == ENOSYS) { + /* ENOSYS means "success, and don't call again" */ + fsess_set_notimpl(mp, op); + err = 0; + break; + } + } fdisp_destroy(&fdi); - return 0; + return err; +} +/* Asynchronous invalidation */ +SDT_PROBE_DEFINE2(fusefs, , internal, invalidate_cache_hit, + "struct vnode*", "struct vnode*"); +int +fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio) +{ + struct fuse_notify_inval_entry_out fnieo; + struct componentname cn; + struct vnode *dvp, *vp; + char name[PATH_MAX]; + int err; + + if ((err = uiomove(&fnieo, sizeof(fnieo), uio)) != 0) + return (err); + + if ((err = uiomove(name, fnieo.namelen, uio)) != 0) + return (err); + name[fnieo.namelen] = '\0'; + /* fusefs does not cache "." or ".." entries */ + if (strncmp(name, ".", sizeof(".")) == 0 || + strncmp(name, "..", sizeof("..")) == 0) + return (0); + + if (fnieo.parent == FUSE_ROOT_ID) + err = VFS_ROOT(mp, LK_SHARED, &dvp); + else + err = fuse_internal_get_cached_vnode( mp, fnieo.parent, + LK_SHARED, &dvp); + /* + * If dvp is not in the cache, then it must've been reclaimed. And + * since fuse_vnop_reclaim does a cache_purge, name's entry must've + * been invalidated already. So we can safely return if dvp == NULL + */ + if (err != 0 || dvp == NULL) + return (err); + /* + * XXX we can't check dvp's generation because the FUSE invalidate + * entry message doesn't include it. Worst case is that we invalidate + * an entry that didn't need to be invalidated.
+ */ + + cn.cn_nameiop = LOOKUP; + cn.cn_flags = 0; /* !MAKEENTRY means free cached entry */ + cn.cn_thread = curthread; + cn.cn_cred = curthread->td_ucred; + cn.cn_lkflags = LK_SHARED; + cn.cn_pnbuf = NULL; + cn.cn_nameptr = name; + cn.cn_namelen = fnieo.namelen; + err = cache_lookup(dvp, &vp, &cn, NULL, NULL); + MPASS(err == 0); + fuse_vnode_clear_attr_cache(dvp); + vput(dvp); + return (0); } +int +fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio) +{ + struct fuse_notify_inval_inode_out fniio; + struct vnode *vp; + int err; + + if ((err = uiomove(&fniio, sizeof(fniio), uio)) != 0) + return (err); + + if (fniio.ino == FUSE_ROOT_ID) + err = VFS_ROOT(mp, LK_EXCLUSIVE, &vp); + else + err = fuse_internal_get_cached_vnode(mp, fniio.ino, LK_SHARED, + &vp); + if (err != 0 || vp == NULL) + return (err); + /* + * XXX we can't check vp's generation because the FUSE invalidate + * inode message doesn't include it. Worst case is that we invalidate + * an inode that didn't need to be invalidated.
+ */ + if (fniio.off >= 0) + fuse_io_invalbuf(vp, curthread); + fuse_vnode_clear_attr_cache(vp); + vput(vp); + return (0); +} + +/* mknod */ +int +fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, struct vattr *vap) +{ + struct fuse_data *data; + struct fuse_mknod_in fmni; + size_t insize; + + data = fuse_get_mpdata(dvp->v_mount); + + fmni.mode = MAKEIMODE(vap->va_type, vap->va_mode); + fmni.rdev = vap->va_rdev; + if (fuse_libabi_geq(data, 7, 12)) { + insize = sizeof(fmni); + fmni.umask = curthread->td_proc->p_fd->fd_cmask; + } else { + insize = FUSE_COMPAT_MKNOD_IN_SIZE; + } + return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKNOD, &fmni, + insize, vap->va_type)); +} + /* readdir */ int fuse_internal_readdir(struct vnode *vp, struct uio *uio, + off_t startoff, struct fuse_filehandle *fufh, - struct fuse_iov *cookediov) + struct fuse_iov *cookediov, + int *ncookies, + u_long *cookies) { int err = 0; struct fuse_dispatcher fdi; - struct fuse_read_in *fri; + struct fuse_read_in *fri = NULL; + int fnd_start; - if (uio_resid(uio) == 0) { + if (uio_resid(uio) == 0) return 0; - } fdisp_init(&fdi, 0); /* @@ -329,51 +514,70 @@ * I/O). */ + /* + * fnd_start is set non-zero once the offset in the directory gets + * to the startoff. This is done because directories must be read + * from the beginning (offset == 0) when fuse_vnop_readdir() needs + * to do an open of the directory. + * If it is not set non-zero here, it will be set non-zero in + * fuse_internal_readdir_processdata() when uio_offset == startoff. 
+ */ + fnd_start = 0; + if (uio->uio_offset == startoff) + fnd_start = 1; while (uio_resid(uio) > 0) { - fdi.iosize = sizeof(*fri); - fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL); + if (fri == NULL) + fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL); + else + fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL); fri = fdi.indata; fri->fh = fufh->fh_id; fri->offset = uio_offset(uio); - fri->size = min(uio_resid(uio), FUSE_DEFAULT_IOSIZE); - /* mp->max_read */ + fri->size = MIN(uio->uio_resid, + fuse_get_mpdata(vp->v_mount)->max_read); - if ((err = fdisp_wait_answ(&fdi))) { + if ((err = fdisp_wait_answ(&fdi))) break; - } - if ((err = fuse_internal_readdir_processdata(uio, fri->size, fdi.answ, - fdi.iosize, cookediov))) { + if ((err = fuse_internal_readdir_processdata(uio, startoff, + &fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov, + ncookies, &cookies))) break; - } } fdisp_destroy(&fdi); return ((err == -1) ? 0 : err); } +/* + * Return -1 to indicate that this readdir is finished, 0 if it copied + * all the directory data read in and it may be possible to read more + * and greater than 0 for a failure. + */ int fuse_internal_readdir_processdata(struct uio *uio, + off_t startoff, + int *fnd_start, size_t reqsize, void *buf, size_t bufsize, - void *param) + struct fuse_iov *cookediov, + int *ncookies, + u_long **cookiesp) { int err = 0; - int cou = 0; int bytesavail; size_t freclen; struct dirent *de; struct fuse_dirent *fudge; - struct fuse_iov *cookediov = param; + u_long *cookies; - if (bufsize < FUSE_NAME_OFFSET) { + cookies = *cookiesp; + if (bufsize < FUSE_NAME_OFFSET) return -1; - } for (;;) { - if (bufsize < FUSE_NAME_OFFSET) { err = -1; break; @@ -381,10 +585,12 @@ fudge = (struct fuse_dirent *)buf; freclen = FUSE_DIRENT_SIZE(fudge); - cou++; - if (bufsize < freclen) { - err = ((cou == 1) ? -1 : 0); + /* + * This indicates a partial directory entry at the + * end of the directory data. 
+ */ + err = -1; break; } #ifdef ZERO_PAD_INCOMPLETE_BUFS @@ -402,30 +608,47 @@ &fudge->namelen); if (bytesavail > uio_resid(uio)) { + /* Out of space for the dir so we are done. */ err = -1; break; } - fiov_refresh(cookediov); - fiov_adjust(cookediov, bytesavail); + /* + * Don't start to copy the directory entries out until + * the requested offset in the directory is found. + */ + if (*fnd_start != 0) { + fiov_adjust(cookediov, bytesavail); + bzero(cookediov->base, bytesavail); - de = (struct dirent *)cookediov->base; - de->d_fileno = fudge->ino; - de->d_reclen = bytesavail; - de->d_type = fudge->type; - de->d_namlen = fudge->namelen; - memcpy((char *)cookediov->base + sizeof(struct dirent) - - MAXNAMLEN - 1, - (char *)buf + FUSE_NAME_OFFSET, fudge->namelen); - dirent_terminate(de); + de = (struct dirent *)cookediov->base; + de->d_fileno = fudge->ino; + de->d_reclen = bytesavail; + de->d_type = fudge->type; + de->d_namlen = fudge->namelen; + memcpy((char *)cookediov->base + sizeof(struct dirent) - + MAXNAMLEN - 1, + (char *)buf + FUSE_NAME_OFFSET, fudge->namelen); + dirent_terminate(de); - err = uiomove(cookediov->base, cookediov->len, uio); - if (err) { - break; - } + err = uiomove(cookediov->base, cookediov->len, uio); + if (err) + break; + if (cookies != NULL) { + if (*ncookies == 0) { + err = -1; + break; + } + *cookies = fudge->off; + cookies++; + (*ncookies)--; + } + } else if (startoff == fudge->off) + *fnd_start = 1; buf = (char *)buf + freclen; bufsize -= freclen; uio_setoffset(uio, fudge->off); } + *cookiesp = cookies; return err; } @@ -439,12 +662,9 @@ enum fuse_opcode op) { struct fuse_dispatcher fdi; - struct fuse_vnode_data *fvdat; - int err; + nlink_t nlink; + int err = 0; - err = 0; - fvdat = VTOFUD(vp); - fdisp_init(&fdi, cnp->cn_namelen + 1); fdisp_make_vp(&fdi, op, dvp, cnp->cn_thread, cnp->cn_cred); @@ -453,6 +673,35 @@ err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); + + if (err) + return (err); + + /* + * Access the cached nlink even if the 
attr cache has expired. If + * it's inaccurate, the worst that will happen is: + * 1) We'll recycle the vnode even though the file has another link we + * don't know about, costing a bit of cpu time, or + * 2) We won't recycle the vnode even though all of its links are gone. + * It will linger around until vnlru reclaims it, costing a bit of + * temporary memory. + */ + nlink = VTOFUD(vp)->cached_attrs.va_nlink--; + + /* + * Purge the parent's attribute cache because the daemon + * should've updated its mtime and ctime. + */ + fuse_vnode_clear_attr_cache(dvp); + + /* NB: nlink could be zero if it was never cached */ + if (nlink <= 1 || vnode_vtype(vp) == VDIR) { + fuse_internal_vnode_disappear(vp); + } else { + cache_purge(vp); + fuse_vnode_update(vp, FN_CTIMECHANGE); + } + return err; } @@ -532,6 +781,13 @@ feo->nodeid, 1); return err; } + + /* + * Purge the parent's attribute cache because the daemon should've + * updated its mtime and ctime + */ + fuse_vnode_clear_attr_cache(dvp); + fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); @@ -593,10 +849,79 @@ ffi = fdi.indata; ffi->nlookup = nlookup; - fuse_insert_message(fdi.tick); + fuse_insert_message(fdi.tick, false); fdisp_destroy(&fdi); } +/* Fetch the vnode's attributes from the daemon*/ +int +fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap, + struct ucred *cred, struct thread *td) +{ + struct fuse_dispatcher fdi; + struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_getattr_in *fgai; + struct fuse_attr_out *fao; + off_t old_filesize = fvdat->cached_attrs.va_size; + struct timespec old_ctime = fvdat->cached_attrs.va_ctime; + struct timespec old_mtime = fvdat->cached_attrs.va_mtime; + enum vtype vtyp; + int err; + + fdisp_init(&fdi, 0); + fdisp_make_vp(&fdi, FUSE_GETATTR, vp, td, cred); + fgai = fdi.indata; + /* + * We could look up a file handle and set it in fgai->fh, but that + * involves extra runtime work and I'm unaware of any file systems that + 
* care. + */ + fgai->getattr_flags = 0; + if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) { + if (err == ENOENT) + fuse_internal_vnode_disappear(vp); + goto out; + } + + fao = (struct fuse_attr_out *)fdi.answ; + vtyp = IFTOVT(fao->attr.mode); + if (fvdat->flag & FN_SIZECHANGE) + fao->attr.size = old_filesize; + if (fvdat->flag & FN_CTIMECHANGE) { + fao->attr.ctime = old_ctime.tv_sec; + fao->attr.ctimensec = old_ctime.tv_nsec; + } + if (fvdat->flag & FN_MTIMECHANGE) { + fao->attr.mtime = old_mtime.tv_sec; + fao->attr.mtimensec = old_mtime.tv_nsec; + } + fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, + fao->attr_valid_nsec, vap); + if (vtyp != vnode_vtype(vp)) { + fuse_internal_vnode_disappear(vp); + err = ENOENT; + } + +out: + fdisp_destroy(&fdi); + return err; +} + +/* Read a vnode's attributes from cache or fetch them from the fuse daemon */ +int +fuse_internal_getattr(struct vnode *vp, struct vattr *vap, struct ucred *cred, + struct thread *td) +{ + struct vattr *attrs; + + if ((attrs = VTOVA(vp)) != NULL) { + *vap = *attrs; /* struct copy */ + return 0; + } + + return fuse_internal_do_getattr(vp, vap, cred, td); +} + void fuse_internal_vnode_disappear(struct vnode *vp) { @@ -604,7 +929,6 @@ ASSERT_VOP_ELOCKED(vp, "fuse_internal_vnode_disappear"); fvdat->flag |= FN_REVOKED; - fvdat->valid_attr_cache = false; cache_purge(vp); } @@ -625,27 +949,69 @@ } fiio = fticket_resp(tick)->base; - /* XXX: Do we want to check anything further besides this? */ - if (fiio->major < 7) { - SDT_PROBE2(fuse, , internal, trace, 1, + data->fuse_libabi_major = fiio->major; + data->fuse_libabi_minor = fiio->minor; + if (!fuse_libabi_geq(data, 7, 4)) { + /* + * With a little work we could support servers as old as 7.1. + * But there would be little payoff. 
+ */ + SDT_PROBE2(fusefs, , internal, trace, 1, "userpace version too low"); err = EPROTONOSUPPORT; goto out; } - data->fuse_libabi_major = fiio->major; - data->fuse_libabi_minor = fiio->minor; if (fuse_libabi_geq(data, 7, 5)) { - if (fticket_resp(tick)->len == sizeof(struct fuse_init_out)) { + if (fticket_resp(tick)->len == sizeof(struct fuse_init_out) || + fticket_resp(tick)->len == FUSE_COMPAT_22_INIT_OUT_SIZE) { data->max_write = fiio->max_write; + if (fiio->flags & FUSE_ASYNC_READ) + data->dataflags |= FSESS_ASYNC_READ; + if (fiio->flags & FUSE_POSIX_LOCKS) + data->dataflags |= FSESS_POSIX_LOCKS; + if (fiio->flags & FUSE_EXPORT_SUPPORT) + data->dataflags |= FSESS_EXPORT_SUPPORT; + /* + * Don't bother to check FUSE_BIG_WRITES, because it's + * redundant with max_write + */ + /* + * max_background and congestion_threshold are not + * implemented + */ } else { err = EINVAL; } } else { - /* Old fix values */ + /* Old fixed values */ data->max_write = 4096; } + if (fuse_libabi_geq(data, 7, 6)) + data->max_readahead_blocks = fiio->max_readahead / maxbcachebuf; + + if (!fuse_libabi_geq(data, 7, 7)) + fsess_set_notimpl(data->mp, FUSE_INTERRUPT); + + if (!fuse_libabi_geq(data, 7, 8)) { + fsess_set_notimpl(data->mp, FUSE_BMAP); + fsess_set_notimpl(data->mp, FUSE_DESTROY); + } + + if (fuse_libabi_geq(data, 7, 23) && fiio->time_gran >= 1 && + fiio->time_gran <= 1000000000) + data->time_gran = fiio->time_gran; + else + data->time_gran = 1; + + if (!fuse_libabi_geq(data, 7, 23)) + data->cache_mode = fuse_data_cache_mode; + else if (fiio->flags & FUSE_WRITEBACK_CACHE) + data->cache_mode = FUSE_CACHE_WB; + else + data->cache_mode = FUSE_CACHE_WT; + out: if (err) { fdata_set_dead(data); @@ -669,14 +1035,156 @@ fiii = fdi.indata; fiii->major = FUSE_KERNEL_VERSION; fiii->minor = FUSE_KERNEL_MINOR_VERSION; - fiii->max_readahead = FUSE_DEFAULT_IOSIZE * 16; - fiii->flags = 0; + /* + * fusefs currently reads ahead no more than one cache block at a time. 
+ * See fuse_read_biobackend + */ + fiii->max_readahead = maxbcachebuf; + /* + * Unsupported features: + * FUSE_FILE_OPS: No known FUSE server or client supports it + * FUSE_ATOMIC_O_TRUNC: our VFS cannot support it + * FUSE_DONT_MASK: unlike Linux, FreeBSD always applies the umask, even + * when default ACLs are in use. + * FUSE_SPLICE_WRITE, FUSE_SPLICE_MOVE, FUSE_SPLICE_READ: FreeBSD + * doesn't have splice(2). + * FUSE_FLOCK_LOCKS: not yet implemented + * FUSE_HAS_IOCTL_DIR: not yet implemented + * FUSE_AUTO_INVAL_DATA: not yet implemented + * FUSE_DO_READDIRPLUS: not yet implemented + * FUSE_READDIRPLUS_AUTO: not yet implemented + * FUSE_ASYNC_DIO: not yet implemented + * FUSE_NO_OPEN_SUPPORT: not yet implemented + */ + fiii->flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_EXPORT_SUPPORT + | FUSE_BIG_WRITES | FUSE_WRITEBACK_CACHE; fuse_insert_callback(fdi.tick, fuse_internal_init_callback); - fuse_insert_message(fdi.tick); + fuse_insert_message(fdi.tick, false); fdisp_destroy(&fdi); } +/* + * Send a FUSE_SETATTR operation with no permissions checks. 
If cred is NULL, + * send the request with root credentials + */ +int fuse_internal_setattr(struct vnode *vp, struct vattr *vap, + struct thread *td, struct ucred *cred) +{ + struct fuse_vnode_data *fvdat; + struct fuse_dispatcher fdi; + struct fuse_setattr_in *fsai; + struct mount *mp; + pid_t pid = td->td_proc->p_pid; + struct fuse_data *data; + int dataflags; + int err = 0; + enum vtype vtyp; + int sizechanged = -1; + uint64_t newsize = 0; + + mp = vnode_mount(vp); + fvdat = VTOFUD(vp); + data = fuse_get_mpdata(mp); + dataflags = data->dataflags; + + fdisp_init(&fdi, sizeof(*fsai)); + fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); + if (!cred) { + fdi.finh->uid = 0; + fdi.finh->gid = 0; + } + fsai = fdi.indata; + fsai->valid = 0; + + if (vap->va_uid != (uid_t)VNOVAL) { + fsai->uid = vap->va_uid; + fsai->valid |= FATTR_UID; + } + if (vap->va_gid != (gid_t)VNOVAL) { + fsai->gid = vap->va_gid; + fsai->valid |= FATTR_GID; + } + if (vap->va_size != VNOVAL) { + struct fuse_filehandle *fufh = NULL; + + /*Truncate to a new value. 
*/ + fsai->size = vap->va_size; + sizechanged = 1; + newsize = vap->va_size; + fsai->valid |= FATTR_SIZE; + + fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); + if (fufh) { + fsai->fh = fufh->fh_id; + fsai->valid |= FATTR_FH; + } + VTOFUD(vp)->flag &= ~FN_SIZECHANGE; + } + if (vap->va_atime.tv_sec != VNOVAL) { + fsai->atime = vap->va_atime.tv_sec; + fsai->atimensec = vap->va_atime.tv_nsec; + fsai->valid |= FATTR_ATIME; + if (vap->va_vaflags & VA_UTIMES_NULL) + fsai->valid |= FATTR_ATIME_NOW; + } + if (vap->va_mtime.tv_sec != VNOVAL) { + fsai->mtime = vap->va_mtime.tv_sec; + fsai->mtimensec = vap->va_mtime.tv_nsec; + fsai->valid |= FATTR_MTIME; + if (vap->va_vaflags & VA_UTIMES_NULL) + fsai->valid |= FATTR_MTIME_NOW; + } else if (fvdat->flag & FN_MTIMECHANGE) { + fsai->mtime = fvdat->cached_attrs.va_mtime.tv_sec; + fsai->mtimensec = fvdat->cached_attrs.va_mtime.tv_nsec; + fsai->valid |= FATTR_MTIME; + } + if (fuse_libabi_geq(data, 7, 23) && fvdat->flag & FN_CTIMECHANGE) { + fsai->ctime = fvdat->cached_attrs.va_ctime.tv_sec; + fsai->ctimensec = fvdat->cached_attrs.va_ctime.tv_nsec; + fsai->valid |= FATTR_CTIME; + } + if (vap->va_mode != (mode_t)VNOVAL) { + fsai->mode = vap->va_mode & ALLPERMS; + fsai->valid |= FATTR_MODE; + } + if (!fsai->valid) { + goto out; + } + + if ((err = fdisp_wait_answ(&fdi))) + goto out; + vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode); + + if (vnode_vtype(vp) != vtyp) { + if (vnode_vtype(vp) == VNON && vtyp != VNON) { + SDT_PROBE2(fusefs, , internal, trace, 1, "FUSE: Dang! " + "vnode_vtype is VNON and vtype isn't."); + } else { + /* + * STALE vnode, ditch + * + * The vnode has changed its type "behind our back". + * There's nothing really we can do, so let us just + * force an internal revocation and tell the caller to + * try again, if interested. 
+ */ + fuse_internal_vnode_disappear(vp); + err = EAGAIN; + } + } + if (err == 0) { + struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ; + fuse_vnode_undirty_cached_timestamps(vp); + fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, + fao->attr_valid_nsec, NULL); + } + +out: + fdisp_destroy(&fdi); + return err; +} + #ifdef ZERO_PAD_INCOMPLETE_BUFS static int isbzero(void *buf, size_t len) @@ -692,3 +1200,17 @@ } #endif + +void +fuse_internal_init(void) +{ + fuse_lookup_cache_misses = counter_u64_alloc(M_WAITOK); + fuse_lookup_cache_hits = counter_u64_alloc(M_WAITOK); +} + +void +fuse_internal_destroy(void) +{ + counter_u64_free(fuse_lookup_cache_hits); + counter_u64_free(fuse_lookup_cache_misses); +} Index: sys/fs/fuse/fuse_io.h =================================================================== --- sys/fs/fuse/fuse_io.h +++ sys/fs/fuse/fuse_io.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -61,7 +66,7 @@ #define _FUSE_IO_H_ int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, - struct ucred *cred); + struct ucred *cred, pid_t pid); int fuse_io_strategy(struct vnode *vp, struct buf *bp); int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td); int fuse_io_invalbuf(struct vnode *vp, struct thread *td); Index: sys/fs/fuse/fuse_io.c =================================================================== --- sys/fs/fuse/fuse_io.c +++ sys/fs/fuse/fuse_io.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. 
* + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -72,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +89,7 @@ #include #include #include +#include #include #include @@ -98,45 +105,162 @@ #include "fuse_ipc.h" #include "fuse_io.h" -SDT_PROVIDER_DECLARE(fuse); /* + * Set in a struct buf to indicate that the write came from the buffer cache + * and the originating cred and pid are no longer known. + */ +#define B_FUSEFS_WRITE_CACHE B_FS_FLAG1 + +SDT_PROVIDER_DECLARE(fusefs); +/* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , io, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*"); +static int +fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end); +static void +fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred, + struct thread *td); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int -fuse_read_biobackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh); +fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, + struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid); static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); + struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, + int ioflag, bool pages); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); + struct ucred *cred, struct fuse_filehandle 
*fufh, int ioflag, pid_t pid); -SDT_PROBE_DEFINE5(fuse, , io, io_dispatch, "struct vnode*", "struct uio*", +/* Invalidate a range of cached data, whether dirty or not */ +static int +fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end) +{ + struct buf *bp; + daddr_t left_lbn, end_lbn, right_lbn; + off_t new_filesize; + int iosize, left_on, right_on, right_blksize; + + iosize = fuse_iosize(vp); + left_lbn = start / iosize; + end_lbn = howmany(end, iosize); + left_on = start & (iosize - 1); + if (left_on != 0) { + bp = getblk(vp, left_lbn, iosize, PCATCH, 0, 0); + if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyend >= left_on) { + /* + * Flush the dirty buffer, because we don't have a + * byte-granular way to record which parts of the + * buffer are valid. + */ + bwrite(bp); + if (bp->b_error) + return (bp->b_error); + } else { + brelse(bp); + } + } + right_on = end & (iosize - 1); + if (right_on != 0) { + right_lbn = end / iosize; + new_filesize = MAX(filesize, end); + right_blksize = MIN(iosize, new_filesize - iosize * right_lbn); + bp = getblk(vp, right_lbn, right_blksize, PCATCH, 0, 0); + if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyoff < right_on) { + /* + * Flush the dirty buffer, because we don't have a + * byte-granular way to record which parts of the + * buffer are valid. + */ + bwrite(bp); + if (bp->b_error) + return (bp->b_error); + } else { + brelse(bp); + } + } + + v_inval_buf_range(vp, left_lbn, end_lbn, iosize); + return (0); +} + +/* + * FreeBSD clears the SUID and SGID bits on any write by a non-root user. 
+ */ +static void +fuse_io_clear_suid_on_write(struct vnode *vp, struct ucred *cred, + struct thread *td) +{ + struct fuse_data *data; + struct mount *mp; + struct vattr va; + int dataflags; + + mp = vnode_mount(vp); + data = fuse_get_mpdata(mp); + dataflags = data->dataflags; + + if (dataflags & FSESS_DEFAULT_PERMISSIONS) { + if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { + fuse_internal_getattr(vp, &va, cred, td); + if (va.va_mode & (S_ISUID | S_ISGID)) { + mode_t mode = va.va_mode & ~(S_ISUID | S_ISGID); + /* Clear all vattr fields except mode */ + vattr_null(&va); + va.va_mode = mode; + + /* + * Ignore fuse_internal_setattr's return value, + * because at this point the write operation has + * already succeeded and we don't want to return + * failing status for that. + */ + (void)fuse_internal_setattr(vp, &va, td, NULL); + } + } + } +} + +SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch, "struct vnode*", "struct uio*", "int", "struct ucred*", "struct fuse_filehandle*"); +SDT_PROBE_DEFINE4(fusefs, , io, io_dispatch_filehandles_closed, "struct vnode*", + "struct uio*", "int", "struct ucred*"); int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, - struct ucred *cred) + struct ucred *cred, pid_t pid) { struct fuse_filehandle *fufh; int err, directio; + int fflag; + bool closefufh = false; MPASS(vp->v_type == VREG || vp->v_type == VDIR); - err = fuse_filehandle_getrw(vp, - (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); - if (err) { + fflag = (uio->uio_rw == UIO_READ) ? FREAD : FWRITE; + err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); + if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { + /* + * nfsd will do I/O without first doing VOP_OPEN. 
We + * must implicitly open the file here + */ + err = fuse_filehandle_open(vp, fflag, &fufh, curthread, cred); + closefufh = true; + } + else if (err) { + SDT_PROBE4(fusefs, , io, io_dispatch_filehandles_closed, + vp, uio, ioflag, cred); printf("FUSE: io dispatch: filehandles are closed\n"); return err; } - SDT_PROBE5(fuse, , io, io_dispatch, vp, uio, ioflag, cred, fufh); + if (err) + goto out; + SDT_PROBE5(fusefs, , io, io_dispatch, vp, uio, ioflag, cred, fufh); /* * Ideally, when the daemon asks for direct io at open time, the @@ -153,108 +277,137 @@ switch (uio->uio_rw) { case UIO_READ: if (directio) { - SDT_PROBE2(fuse, , io, trace, 1, + SDT_PROBE2(fusefs, , io, trace, 1, "direct read of vnode"); err = fuse_read_directbackend(vp, uio, cred, fufh); } else { - SDT_PROBE2(fuse, , io, trace, 1, + SDT_PROBE2(fusefs, , io, trace, 1, "buffered read of vnode"); - err = fuse_read_biobackend(vp, uio, cred, fufh); + err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh, + pid); } break; case UIO_WRITE: - /* - * Kludge: simulate write-through caching via write-around - * caching. Same effect, as far as never caching dirty data, - * but slightly pessimal in that newly written data is not - * cached. 
- */ - if (directio || fuse_data_cache_mode == FUSE_CACHE_WT) { - SDT_PROBE2(fuse, , io, trace, 1, + fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); + if (directio) { + off_t start, end, filesize; + + SDT_PROBE2(fusefs, , io, trace, 1, "direct write of vnode"); - err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag); + + err = fuse_vnode_size(vp, &filesize, cred, curthread); + if (err) + goto out; + + start = uio->uio_offset; + end = start + uio->uio_resid; + KASSERT((ioflag & (IO_VMIO | IO_DIRECT)) != + (IO_VMIO | IO_DIRECT), + ("IO_DIRECT used for a cache flush?")); + /* Invalidate the write cache when writing directly */ + err = fuse_inval_buf_range(vp, filesize, start, end); + if (err) + return (err); + err = fuse_write_directbackend(vp, uio, cred, fufh, + filesize, ioflag, false); } else { - SDT_PROBE2(fuse, , io, trace, 1, + SDT_PROBE2(fusefs, , io, trace, 1, "buffered write of vnode"); - err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag); + if (!fsess_opt_writeback(vnode_mount(vp))) + ioflag |= IO_SYNC; + err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, + pid); } + fuse_io_clear_suid_on_write(vp, cred, uio->uio_td); break; default: panic("uninterpreted mode passed to fuse_io_dispatch"); } +out: + if (closefufh) + fuse_filehandle_close(vp, fufh, curthread, cred); + return (err); } -SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_start, "int", "int", "int"); -SDT_PROBE_DEFINE2(fuse, , io, read_bio_backend_feed, "int", "int"); -SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_end, "int", "ssize_t", "int"); +SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int"); +SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "struct buf*"); +SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int", + "struct buf*"); static int -fuse_read_biobackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh) +fuse_read_biobackend(struct vnode *vp, struct uio 
*uio, int ioflag, + struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid) { struct buf *bp; - daddr_t lbn; - int bcount; - int err = 0, n = 0, on = 0; + struct mount *mp; + struct fuse_data *data; + daddr_t lbn, nextlbn; + int bcount, nextsize; + int err, n = 0, on = 0, seqcount; off_t filesize; const int biosize = fuse_iosize(vp); + mp = vnode_mount(vp); + data = fuse_get_mpdata(mp); - if (uio->uio_resid == 0) - return (0); if (uio->uio_offset < 0) return (EINVAL); - bcount = biosize; - filesize = VTOFUD(vp)->filesize; + seqcount = ioflag >> IO_SEQSHIFT; - do { + err = fuse_vnode_size(vp, &filesize, cred, curthread); + if (err) + return err; + + for (err = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } + if (filesize - uio->uio_offset <= 0) + break; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); - SDT_PROBE3(fuse, , io, read_bio_backend_start, - biosize, (int)lbn, on); - - /* - * Obtain the buffer cache block. Figure out the buffer size - * when we are at EOF. If we are modifying the size of the - * buffer based on an EOF condition we need to hold - * nfs_rslock() through obtaining the buffer to prevent - * a potential writer-appender from messing with n_size. - * Otherwise we may accidentally truncate the buffer and - * lose dirty data. - * - * Note that bcount is *not* DEV_BSIZE aligned. - */ if ((off_t)lbn * biosize >= filesize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { bcount = filesize - (off_t)lbn *biosize; + } else { + bcount = biosize; } - bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); + nextlbn = lbn + 1; + nextsize = MIN(biosize, filesize - nextlbn * biosize); - if (!bp) - return (EINTR); + SDT_PROBE4(fusefs, , io, read_bio_backend_start, + biosize, (int)lbn, on, bcount); - /* - * If B_CACHE is not set, we must issue the read. If this - * fails, we return an error. 
- */ + if (bcount < biosize) { + /* If near EOF, don't do readahead */ + err = bread(vp, lbn, bcount, NOCRED, &bp); + } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { + /* Try clustered read */ + long totread = uio->uio_resid + on; + seqcount = MIN(seqcount, + data->max_readahead_blocks + 1); + err = cluster_read(vp, filesize, lbn, bcount, NOCRED, + totread, seqcount, 0, &bp); + } else if (seqcount > 1 && data->max_readahead_blocks >= 1) { + /* Try non-clustered readahead */ + err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1, + NOCRED, &bp); + } else { + /* Just read what was requested */ + err = bread(vp, lbn, bcount, NOCRED, &bp); + } - if ((bp->b_flags & B_CACHE) == 0) { - bp->b_iocmd = BIO_READ; - vfs_busy_pages(bp, 0); - err = fuse_io_strategy(vp, bp); - if (err) { - brelse(bp); - return (err); - } + if (err) { + brelse(bp); + bp = NULL; + break; } + /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is @@ -264,33 +417,41 @@ */ n = 0; - if (on < bcount) - n = MIN((unsigned)(bcount - on), uio->uio_resid); + if (on < bcount - bp->b_resid) + n = MIN((unsigned)(bcount - bp->b_resid - on), + uio->uio_resid); if (n > 0) { - SDT_PROBE2(fuse, , io, read_bio_backend_feed, - n, n + (int)bp->b_resid); + SDT_PROBE2(fusefs, , io, read_bio_backend_feed, n, bp); err = uiomove(bp->b_data + on, n, uio); } - brelse(bp); - SDT_PROBE3(fuse, , io, read_bio_backend_end, err, - uio->uio_resid, n); - } while (err == 0 && uio->uio_resid > 0 && n > 0); + vfs_bio_brelse(bp, ioflag); + SDT_PROBE4(fusefs, , io, read_bio_backend_end, err, + uio->uio_resid, n, bp); + if (bp->b_resid > 0) { + /* Short read indicates EOF */ + break; + } + } return (err); } -SDT_PROBE_DEFINE1(fuse, , io, read_directbackend_start, "struct fuse_read_in*"); -SDT_PROBE_DEFINE2(fuse, , io, read_directbackend_complete, - "struct fuse_dispatcher*", "struct uio*"); +SDT_PROBE_DEFINE1(fusefs, , io, read_directbackend_start, + "struct 
fuse_read_in*"); +SDT_PROBE_DEFINE3(fusefs, , io, read_directbackend_complete, + "struct fuse_dispatcher*", "struct fuse_read_in*", "struct uio*"); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { + struct fuse_data *data; struct fuse_dispatcher fdi; struct fuse_read_in *fri; int err = 0; + data = fuse_get_mpdata(vp->v_mount); + if (uio->uio_resid == 0) return (0); @@ -312,19 +473,29 @@ fri->offset = uio->uio_offset; fri->size = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_read); + if (fuse_libabi_geq(data, 7, 9)) { + /* See comment regarding FUSE_WRITE_LOCKOWNER */ + fri->read_flags = 0; + fri->flags = fufh_type_2_fflags(fufh->fufh_type); + } - SDT_PROBE1(fuse, , io, read_directbackend_start, fri); + SDT_PROBE1(fusefs, , io, read_directbackend_start, fri); if ((err = fdisp_wait_answ(&fdi))) goto out; - SDT_PROBE2(fuse, , io, read_directbackend_complete, - fdi.iosize, uio); + SDT_PROBE3(fusefs, , io, read_directbackend_complete, + &fdi, fri, uio); if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) break; - if (fdi.iosize < fri->size) + if (fdi.iosize < fri->size) { + /* + * Short read. Should only happen at EOF or with + * direct io. + */ break; + } } out: @@ -334,25 +505,57 @@ static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) + struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, + int ioflag, bool pages) { struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_data *data; struct fuse_write_in *fwi; + struct fuse_write_out *fwo; struct fuse_dispatcher fdi; size_t chunksize; + void *fwi_data; + off_t as_written_offset; int diff; int err = 0; + bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO; + bool wrote_anything = false; + uint32_t write_flags; + data = fuse_get_mpdata(vp->v_mount); + + /* + * Don't set FUSE_WRITE_LOCKOWNER in write_flags. 
It can't be set + * accurately when using POSIX AIO, libfuse doesn't use it, and I'm not + * aware of any file systems that do. It was an attempt to add + * Linux-style mandatory locking to the FUSE protocol, but mandatory + * locking is deprecated even on Linux. See Linux commit + * f33321141b273d60cbb3a8f56a5489baad82ba5e . + */ + /* + * Set FUSE_WRITE_CACHE whenever we don't know the uid, gid, and/or pid + * that originated a write. For example when writing from the + * writeback cache. I don't know of a single file system that cares, + * but the protocol says we're supposed to do this. + */ + write_flags = !pages && ( + (ioflag & IO_DIRECT) || + !fsess_opt_datacache(vnode_mount(vp)) || + !fsess_opt_writeback(vnode_mount(vp))) ? 0 : FUSE_WRITE_CACHE; + if (uio->uio_resid == 0) return (0); + if (ioflag & IO_APPEND) - uio_setoffset(uio, fvdat->filesize); + uio_setoffset(uio, filesize); + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) + return (EFBIG); + fdisp_init(&fdi, 0); while (uio->uio_resid > 0) { - chunksize = MIN(uio->uio_resid, - fuse_get_mpdata(vp->v_mount)->max_write); + chunksize = MIN(uio->uio_resid, data->max_write); fdi.iosize = sizeof(*fwi) + chunksize; fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); @@ -361,79 +564,140 @@ fwi->fh = fufh->fh_id; fwi->offset = uio->uio_offset; fwi->size = chunksize; + fwi->write_flags = write_flags; + if (fuse_libabi_geq(data, 7, 9)) { + fwi->flags = fufh_type_2_fflags(fufh->fufh_type); + fwi_data = (char *)fdi.indata + sizeof(*fwi); + } else { + fwi_data = (char *)fdi.indata + + FUSE_COMPAT_WRITE_IN_SIZE; + } - if ((err = uiomove((char *)fdi.indata + sizeof(*fwi), - chunksize, uio))) + if ((err = uiomove(fwi_data, chunksize, uio))) break; - if ((err = fdisp_wait_answ(&fdi))) +retry: + err = fdisp_wait_answ(&fdi); + if (err == ERESTART || err == EINTR || err == EWOULDBLOCK) { + /* + * Rewind the uio so dofilewrite will know it's + * incomplete + */ + uio->uio_resid += fwi->size; + uio->uio_offset -= fwi->size; + 
/* + * Change ERESTART into EINTR because we can't rewind + * uio->uio_iov. Basically, once uiomove(9) has been + * called, it's impossible to restart a syscall. + */ + if (err == ERESTART) + err = EINTR; break; + } else if (err) { + break; + } else { + wrote_anything = true; + } + fwo = ((struct fuse_write_out *)fdi.answ); + /* Adjust the uio in the case of short writes */ - diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size; + diff = fwi->size - fwo->size; + as_written_offset = uio->uio_offset - diff; + + if (as_written_offset - diff > filesize) + fuse_vnode_setsize(vp, as_written_offset); + if (as_written_offset - diff >= filesize) + fvdat->flag &= ~FN_SIZECHANGE; + if (diff < 0) { + printf("WARNING: misbehaving FUSE filesystem " + "wrote more data than we provided it\n"); err = EINVAL; break; - } else if (diff > 0 && !(ioflag & IO_DIRECT)) { - /* - * XXX We really should be directly checking whether - * the file was opened with FOPEN_DIRECT_IO, not - * IO_DIRECT. IO_DIRECT can be set in multiple ways. 
- */ - SDT_PROBE2(fuse, , io, trace, 1, - "misbehaving filesystem: short writes are only " - "allowed with direct_io"); + } else if (diff > 0) { + /* Short write */ + if (!direct_io) { + printf("WARNING: misbehaving FUSE filesystem: " + "short writes are only allowed with " + "direct_io\n"); + } + if (ioflag & IO_DIRECT) { + /* Return early */ + uio->uio_resid += diff; + uio->uio_offset -= diff; + break; + } else { + /* Resend the unwritten portion of data */ + fdi.iosize = sizeof(*fwi) + diff; + /* Refresh fdi without clearing data buffer */ + fdisp_refresh_vp(&fdi, FUSE_WRITE, vp, + uio->uio_td, cred); + fwi = fdi.indata; + MPASS2(fwi == fdi.indata, "FUSE dispatcher " + "reallocated despite no increase in " + "size?"); + void *src = (char*)fwi_data + fwo->size; + memmove(fwi_data, src, diff); + fwi->fh = fufh->fh_id; + fwi->offset = as_written_offset; + fwi->size = diff; + fwi->write_flags = write_flags; + goto retry; + } } - uio->uio_resid += diff; - uio->uio_offset -= diff; - - if (uio->uio_offset > fvdat->filesize && - fuse_data_cache_mode != FUSE_CACHE_UC) { - fuse_vnode_setsize(vp, uio->uio_offset); - fvdat->flag &= ~FN_SIZECHANGE; - } } fdisp_destroy(&fdi); + if (wrote_anything) + fuse_vnode_undirty_cached_timestamps(vp); + return (err); } -SDT_PROBE_DEFINE6(fuse, , io, write_biobackend_start, "int64_t", "int", "int", +SDT_PROBE_DEFINE6(fusefs, , io, write_biobackend_start, "int64_t", "int", "int", "struct uio*", "int", "bool"); -SDT_PROBE_DEFINE2(fuse, , io, write_biobackend_append_race, "long", "int"); +SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_append_race, "long", "int"); +SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_issue, "int", "struct buf*"); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, - struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) + struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct buf *bp; daddr_t lbn; + off_t filesize; 
int bcount; - int n, on, err = 0; + int n, on, seqcount, err = 0; + bool last_page; const int biosize = fuse_iosize(vp); - KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); + seqcount = ioflag >> IO_SEQSHIFT; + + KASSERT(uio->uio_rw == UIO_WRITE, ("fuse_write_biobackend mode")); if (vp->v_type != VREG) return (EIO); if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); + + err = fuse_vnode_size(vp, &filesize, cred, curthread); + if (err) + return err; + if (ioflag & IO_APPEND) - uio_setoffset(uio, fvdat->filesize); + uio_setoffset(uio, filesize); - /* - * Find all of this file's B_NEEDCOMMIT buffers. If our writes - * would exceed the local maximum per-file write commit size when - * combined with those, we must decide whether to flush, - * go synchronous, or return err. We don't bother checking - * IO_UNIT -- we just make all writes atomic anyway, as there's - * no point optimizing for something that really won't ever happen. - */ + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) + return (EFBIG); + do { + bool direct_append, extending; + if (fuse_isdeadfs(vp)) { err = ENXIO; break; @@ -443,66 +707,60 @@ n = MIN((unsigned)(biosize - on), uio->uio_resid); again: - /* - * Handle direct append and file extension cases, calculate - * unaligned buffer size. - */ - if (uio->uio_offset == fvdat->filesize && n) { - /* - * Get the buffer (in its pre-append state to maintain - * B_CACHE if it was previously set). Resize the - * nfsnode after we have locked the buffer to prevent - * readers from reading garbage. 
- */ - bcount = on; - SDT_PROBE6(fuse, , io, write_biobackend_start, - lbn, on, n, uio, bcount, true); - bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); - + /* Get or create a buffer for the write */ + direct_append = uio->uio_offset == filesize && n; + if (uio->uio_offset + n < filesize) { + extending = false; + if ((off_t)(lbn + 1) * biosize < filesize) { + /* Not the file's last block */ + bcount = biosize; + } else { + /* The file's last block */ + bcount = filesize - (off_t)lbn * biosize; + } + } else { + extending = true; + bcount = on + n; + } + if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >= + howmany(filesize, PAGE_SIZE)) + last_page = true; + else + last_page = false; + if (direct_append) { + /* + * Take care to preserve the buffer's B_CACHE state so + * as not to cause an unnecessary read. + */ + bp = getblk(vp, lbn, on, PCATCH, 0, 0); if (bp != NULL) { - long save; - - err = fuse_vnode_setsize(vp, - uio->uio_offset + n); - if (err) { - brelse(bp); - break; - } - save = bp->b_flags & B_CACHE; - bcount += n; + uint32_t save = bp->b_flags & B_CACHE; allocbuf(bp, bcount); bp->b_flags |= save; } } else { - /* - * Obtain the locked cache block first, and then - * adjust the file's size as appropriate. 
- */ - bcount = on + n; - if ((off_t)lbn * biosize + bcount < fvdat->filesize) { - if ((off_t)(lbn + 1) * biosize < fvdat->filesize) - bcount = biosize; - else - bcount = fvdat->filesize - - (off_t)lbn *biosize; - } - SDT_PROBE6(fuse, , io, write_biobackend_start, - lbn, on, n, uio, bcount, false); bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); - if (bp && uio->uio_offset + n > fvdat->filesize) { - err = fuse_vnode_setsize(vp, - uio->uio_offset + n); - if (err) { - brelse(bp); - break; - } - } } - if (!bp) { err = EINTR; break; } + if (extending) { + /* + * Extend file _after_ locking buffer so we won't race + * with other readers + */ + err = fuse_vnode_setsize(vp, uio->uio_offset + n); + filesize = uio->uio_offset + n; + fvdat->flag |= FN_SIZECHANGE; + if (err) { + brelse(bp); + break; + } + } + + SDT_PROBE6(fusefs, , io, write_biobackend_start, + lbn, on, n, uio, bcount, direct_append); /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write @@ -535,6 +793,21 @@ brelse(bp); break; } + if (bp->b_resid > 0) { + /* + * Short read indicates EOF. Update file size + * from the server and try again. + */ + SDT_PROBE2(fusefs, , io, trace, 1, + "Short read during a RMW"); + brelse(bp); + err = fuse_vnode_size(vp, &filesize, cred, + curthread); + if (err) + break; + else + goto again; + } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); @@ -547,9 +820,8 @@ * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ - if (bp->b_dirtyend > bcount) { - SDT_PROBE2(fuse, , io, write_biobackend_append_race, + SDT_PROBE2(fusefs, , io, write_biobackend_append_race, (long)bp->b_blkno * biosize, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; @@ -582,6 +854,7 @@ * reasons: the only way to know if a write is valid * if its actually written out.) 
*/ + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 0, bp); bwrite(bp); if (bp->b_error == EINTR) { err = EINTR; @@ -591,19 +864,12 @@ } err = uiomove((char *)bp->b_data + on, n, uio); - /* - * Since this block is being modified, it must be written - * again and not just committed. Since write clustering does - * not work for the stage 1 data write, only the stage 2 - * commit rpc, we have to clear B_CLUSTEROK as well. - */ - bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); - if (err) { bp->b_ioflags |= BIO_ERROR; bp->b_error = err; brelse(bp); break; + /* TODO: vfs_bio_clrbuf like ffs_write does? */ } /* * Only update dirtyoff/dirtyend if not a degenerate @@ -619,42 +885,85 @@ } vfs_bio_set_valid(bp, on, n); } - err = bwrite(bp); + + vfs_bio_set_flags(bp, ioflag); + + bp->b_flags |= B_FUSEFS_WRITE_CACHE; + if (ioflag & IO_SYNC) { + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 2, bp); + if (!(ioflag & IO_VMIO)) + bp->b_flags &= ~B_FUSEFS_WRITE_CACHE; + err = bwrite(bp); + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + (ioflag & IO_ASYNC)) { + bp->b_flags |= B_CLUSTEROK; + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 3, bp); + bawrite(bp); + } else if (on == 0 && n == bcount) { + if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { + bp->b_flags |= B_CLUSTEROK; + SDT_PROBE2(fusefs, , io, write_biobackend_issue, + 4, bp); + cluster_write(vp, bp, filesize, seqcount, 0); + } else { + SDT_PROBE2(fusefs, , io, write_biobackend_issue, + 5, bp); + bawrite(bp); + } + } else if (ioflag & IO_DIRECT) { + bp->b_flags |= B_CLUSTEROK; + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 6, bp); + bawrite(bp); + } else { + bp->b_flags &= ~B_CLUSTEROK; + SDT_PROBE2(fusefs, , io, write_biobackend_issue, 7, bp); + bdwrite(bp); + } if (err) break; } while (uio->uio_resid > 0 && n > 0); - if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) - fuse_vnode_savesize(vp, cred); - return (err); } int fuse_io_strategy(struct vnode *vp, struct buf *bp) { - 
struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_filehandle *fufh; struct ucred *cred; struct uio *uiop; struct uio uio; struct iovec io; + off_t filesize; int error = 0; + int fflag; + /* We don't know the true pid when we're dealing with the cache */ + pid_t pid = 0; const int biosize = fuse_iosize(vp); MPASS(vp->v_type == VREG || vp->v_type == VDIR); MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); - error = fuse_filehandle_getrw(vp, - (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); + fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE; + cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; + error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); + if (bp->b_iocmd == BIO_READ && error == EBADF) { + /* + * This may be a read-modify-write operation on a cached file + * opened O_WRONLY. The FUSE protocol allows this. + */ + error = fuse_filehandle_get(vp, FWRITE, &fufh, cred, pid); + } if (error) { printf("FUSE: strategy: filehandles are closed\n"); bp->b_ioflags |= BIO_ERROR; bp->b_error = error; + bufdone(bp); return (error); } - cred = bp->b_iocmd == BIO_READ ? 
bp->b_rcred : bp->b_wcred; uiop = &uio; uiop->uio_iov = &io; @@ -673,40 +982,57 @@ KASSERT(!(bp->b_flags & B_DONE), ("fuse_io_strategy: bp %p already marked done", bp)); if (bp->b_iocmd == BIO_READ) { + ssize_t left; + io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; - uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; + uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize; error = fuse_read_directbackend(vp, uiop, cred, fufh); + /* + * Store the amount we failed to read in the buffer's private + * field, so callers can truncate the file if necessary. + */ - /* XXXCEM: Potentially invalid access to cached_attrs here */ - if ((!error && uiop->uio_resid) || - (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && - uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && - uiop->uio_offset >= fvdat->cached_attrs.va_size)) { - /* - * If we had a short read with no error, we must have - * hit a file hole. We should zero-fill the remainder. - * This can also occur if the server hits the file EOF. - * - * Holes used to be able to occur due to pending - * writes, but that is not possible any longer. - */ + if (!error && uiop->uio_resid) { int nread = bp->b_bcount - uiop->uio_resid; - int left = uiop->uio_resid; + left = uiop->uio_resid; + bzero((char *)bp->b_data + nread, left); - if (error != 0) { - printf("FUSE: Fix broken io: offset %ju, " - " resid %zd, file size %ju/%ju\n", - (uintmax_t)uiop->uio_offset, - uiop->uio_resid, fvdat->filesize, - fvdat->cached_attrs.va_size); - error = 0; + if ((fvdat->flag & FN_SIZECHANGE) == 0) { + /* + * A short read with no error, when not using + * direct io, and when no writes are cached, + * indicates EOF caused by a server-side + * truncation. Clear the attr cache so we'll + * pick up the new file size and timestamps. + * + * We must still bzero the remaining buffer so + * uninitialized data doesn't get exposed by a + * future truncate that extends the file.
+ * + * To prevent lock order problems, we must + * truncate the file upstack, not here. + */ + SDT_PROBE2(fusefs, , io, trace, 1, + "Short read of a clean file"); + fuse_vnode_clear_attr_cache(vp); + } else { + /* + * If dirty writes _are_ cached beyond EOF, + * that indicates a newly created hole that the + * server doesn't know about. Those don't pose + * any problem. + * XXX: we don't currently track whether dirty + * writes are cached beyond EOF, before EOF, or + * both. + */ + SDT_PROBE2(fusefs, , io, trace, 1, + "Short read of a dirty file"); + uiop->uio_resid = 0; } - if (left > 0) - bzero((char *)bp->b_data + nread, left); - uiop->uio_resid = 0; + } if (error) { bp->b_ioflags |= BIO_ERROR; @@ -714,33 +1040,33 @@ } } else { /* - * If we only need to commit, try to commit - */ - if (bp->b_flags & B_NEEDCOMMIT) { - SDT_PROBE2(fuse, , io, trace, 1, - "write: B_NEEDCOMMIT flags set"); - } - /* * Setup for actual write */ - if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > - fvdat->filesize) - bp->b_dirtyend = fvdat->filesize - - (off_t)bp->b_blkno * biosize; + error = fuse_vnode_size(vp, &filesize, cred, curthread); + if (error) { + bp->b_ioflags |= BIO_ERROR; + bp->b_error = error; + bufdone(bp); + return (error); + } + if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize) + bp->b_dirtyend = filesize - + (off_t)bp->b_lblkno * biosize; + if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; - uiop->uio_offset = (off_t)bp->b_blkno * biosize + uiop->uio_offset = (off_t)bp->b_lblkno * biosize + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; - error = fuse_write_directbackend(vp, uiop, cred, fufh, 0); + bool pages = bp->b_flags & B_FUSEFS_WRITE_CACHE; + error = fuse_write_directbackend(vp, uiop, cred, fufh, + filesize, 0, pages); - if (error == EINTR || error == ETIMEDOUT - || (!error && (bp->b_flags & B_NEEDCOMMIT))) { - + if (error == EINTR || error == 
ETIMEDOUT) { bp->b_flags &= ~(B_INVAL | B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); Index: sys/fs/fuse/fuse_ipc.h =================================================================== --- sys/fs/fuse/fuse_ipc.h +++ sys/fs/fuse/fuse_ipc.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -63,6 +68,12 @@ #include #include +enum fuse_data_cache_mode { + FUSE_CACHE_UC, + FUSE_CACHE_WT, + FUSE_CACHE_WB, +}; + struct fuse_iov { void *base; size_t len; @@ -103,6 +114,12 @@ struct fuse_data *tk_data; int tk_flag; u_int tk_refcount; + /* + * If this ticket's operation has been interrupted, this will hold the + * unique value of the FUSE_INTERRUPT operation. Otherwise, it will be + * 0. + */ + uint64_t irq_unique; /* fields for initiating an upgoing message */ struct fuse_iov tk_ms_fiov; @@ -147,16 +164,20 @@ ftick->tk_flag |= FT_ANSW; } +static inline struct fuse_in_header* +fticket_in_header(struct fuse_ticket *ftick) +{ + return (struct fuse_in_header *)(ftick->tk_ms_fiov.base); +} + static inline enum fuse_opcode fticket_opcode(struct fuse_ticket *ftick) { - return (((struct fuse_in_header *)(ftick->tk_ms_fiov.base))->opcode); + return fticket_in_header(ftick)->opcode; } int fticket_pull(struct fuse_ticket *ftick, struct uio *uio); -enum mountpri { FM_NOMOUNTED, FM_PRIMARY, FM_SECONDARY }; - /* * The data representing a FUSE session. */ @@ -170,10 +191,16 @@ struct mtx ms_mtx; STAILQ_HEAD(, fuse_ticket) ms_head; + int ms_count; struct mtx aw_mtx; TAILQ_HEAD(, fuse_ticket) aw_head; + /* + * Holds the next value of the FUSE operation unique value. 
+ * Also, serves as a wakeup channel to prevent any operations from + * being created before INIT completes. + */ u_long ticketer; struct sx rename_lock; @@ -181,6 +208,7 @@ uint32_t fuse_libabi_major; uint32_t fuse_libabi_minor; + uint32_t max_readahead_blocks; uint32_t max_write; uint32_t max_read; uint32_t subtype; @@ -189,34 +217,27 @@ struct selinfo ks_rsel; int daemon_timeout; + unsigned time_gran; uint64_t notimpl; + uint64_t mnt_flag; + enum fuse_data_cache_mode cache_mode; }; #define FSESS_DEAD 0x0001 /* session is to be closed */ -#define FSESS_UNUSED0 0x0002 /* unused */ #define FSESS_INITED 0x0004 /* session has been inited */ #define FSESS_DAEMON_CAN_SPY 0x0010 /* let non-owners access this fs */ /* (and being observed by the daemon) */ #define FSESS_PUSH_SYMLINKS_IN 0x0020 /* prefix absolute symlinks with mp */ #define FSESS_DEFAULT_PERMISSIONS 0x0040 /* kernel does permission checking */ -#define FSESS_NO_ATTRCACHE 0x0080 /* no attribute caching */ -#define FSESS_NO_READAHEAD 0x0100 /* no readaheads */ -#define FSESS_NO_DATACACHE 0x0200 /* disable buffer cache */ -#define FSESS_NO_NAMECACHE 0x0400 /* disable name cache */ -#define FSESS_NO_MMAP 0x0800 /* disable mmap */ -#define FSESS_BROKENIO 0x1000 /* fix broken io */ +#define FSESS_ASYNC_READ 0x1000 /* allow multiple reads of some file */ +#define FSESS_POSIX_LOCKS 0x2000 /* daemon supports POSIX locks */ +#define FSESS_EXPORT_SUPPORT 0x10000 /* daemon supports NFS-style lookups */ +#define FSESS_INTR 0x20000 /* interruptible mounts */ +#define FSESS_MNTOPTS_MASK ( \ + FSESS_DAEMON_CAN_SPY | FSESS_PUSH_SYMLINKS_IN | \ + FSESS_DEFAULT_PERMISSIONS | FSESS_INTR) -enum fuse_data_cache_mode { - FUSE_CACHE_UC, - FUSE_CACHE_WT, - FUSE_CACHE_WB, -}; - extern int fuse_data_cache_mode; -extern int fuse_data_cache_invalidate; -extern int fuse_mmap_enable; -extern int fuse_sync_resize; -extern int fuse_fix_broken_io; static inline struct fuse_data * fuse_get_mpdata(struct mount *mp) @@ -245,36 +266,43 @@ { 
struct fuse_data *data = fuse_get_mpdata(mp); - return (fuse_data_cache_mode != FUSE_CACHE_UC && - (data->dataflags & FSESS_NO_DATACACHE) == 0); + return (data->cache_mode != FUSE_CACHE_UC); } static inline bool fsess_opt_mmap(struct mount *mp) { - struct fuse_data *data = fuse_get_mpdata(mp); - - if (!fuse_mmap_enable || fuse_data_cache_mode == FUSE_CACHE_UC) - return (false); - return ((data->dataflags & (FSESS_NO_DATACACHE | FSESS_NO_MMAP)) == 0); + return (fsess_opt_datacache(mp)); } static inline bool -fsess_opt_brokenio(struct mount *mp) +fsess_opt_writeback(struct mount *mp) { struct fuse_data *data = fuse_get_mpdata(mp); - return (fuse_fix_broken_io || (data->dataflags & FSESS_BROKENIO)); + return (data->cache_mode == FUSE_CACHE_WB); } +/* Insert a new upgoing message */ static inline void fuse_ms_push(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED); refcount_acquire(&ftick->tk_refcount); STAILQ_INSERT_TAIL(&ftick->tk_data->ms_head, ftick, tk_ms_link); + ftick->tk_data->ms_count++; } +/* Insert a new upgoing message to the front of the queue */ +static inline void +fuse_ms_push_head(struct fuse_ticket *ftick) +{ + mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED); + refcount_acquire(&ftick->tk_refcount); + STAILQ_INSERT_HEAD(&ftick->tk_data->ms_head, ftick, tk_ms_link); + ftick->tk_data->ms_count++; +} + static inline struct fuse_ticket * fuse_ms_pop(struct fuse_data *data) { @@ -284,7 +312,9 @@ if ((ftick = STAILQ_FIRST(&data->ms_head))) { STAILQ_REMOVE_HEAD(&data->ms_head, tk_ms_link); + data->ms_count--; #ifdef INVARIANTS + MPASS(data->ms_count >= 0); ftick->tk_ms_link.stqe_next = NULL; #endif } @@ -327,7 +357,7 @@ struct fuse_ticket *fuse_ticket_fetch(struct fuse_data *data); int fuse_ticket_drop(struct fuse_ticket *ftick); void fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t *handler); -void fuse_insert_message(struct fuse_ticket *ftick); +void fuse_insert_message(struct fuse_ticket *ftick, bool irq); static inline 
bool fuse_libabi_geq(struct fuse_data *data, uint32_t abi_maj, uint32_t abi_min) @@ -374,13 +404,15 @@ #endif } +void fdisp_refresh(struct fuse_dispatcher *fdip); + void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred); -void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, - struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred); - void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, + struct vnode *vp, struct thread *td, struct ucred *cred); + +void fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred); int fdisp_wait_answ(struct fuse_dispatcher *fdip); Index: sys/fs/fuse/fuse_ipc.c =================================================================== --- sys/fs/fuse/fuse_ipc.c +++ sys/fs/fuse/fuse_ipc.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -61,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -84,14 +90,17 @@ #include "fuse_ipc.h" #include "fuse_internal.h" -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , ipc, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , ipc, trace, "int", "char*"); +static void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, + struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred); +static void fuse_interrupt_send(struct fuse_ticket *otick, int err); static struct fuse_ticket *fticket_alloc(struct fuse_data *data); static void fticket_refresh(struct fuse_ticket *ftick); static void fticket_destroy(struct fuse_ticket *ftick); @@ -104,13 +113,10 @@ static fuse_handler_t fuse_standard_handler; -SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables"); -SYSCTL_STRING(_vfs_fusefs, OID_AUTO, version, CTLFLAG_RD, - FUSE_FREEBSD_VERSION, 0, "fuse-freebsd version"); -static int fuse_ticket_count = 0; +static counter_u64_t fuse_ticket_count; +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, ticket_count, CTLFLAG_RD, + &fuse_ticket_count, "Number of allocated tickets"); -SYSCTL_INT(_vfs_fusefs, OID_AUTO, ticket_count, CTLFLAG_RW, - &fuse_ticket_count, 0, "number of allocated tickets"); static long fuse_iov_permanent_bufsize = 1 << 19; SYSCTL_LONG(_vfs_fusefs, OID_AUTO, iov_permanent_bufsize, CTLFLAG_RW, @@ -125,25 +131,131 @@ MALLOC_DEFINE(M_FUSEMSG, "fuse_msgbuf", "fuse message buffer"); static uma_zone_t ticket_zone; -static void -fuse_block_sigs(sigset_t *oldset) +/* + * TODO: figure out how to timeout INTERRUPT requests, because the daemon may + * legally never respond + */ +static int +fuse_interrupt_callback(struct fuse_ticket *tick, struct uio *uio) { - sigset_t newset; + struct fuse_ticket *otick, *x_tick; + struct fuse_interrupt_in *fii; + struct fuse_data *data = tick->tk_data; + bool found = false; - SIGFILLSET(newset); - SIGDELSET(newset, SIGKILL); - if (kern_sigprocmask(curthread, SIG_BLOCK, &newset, oldset, 0)) - panic("%s: Invalid operation for kern_sigprocmask()", - __func__); + fii = (struct
fuse_interrupt_in*)((char*)tick->tk_ms_fiov.base + + sizeof(struct fuse_in_header)); + + fuse_lck_mtx_lock(data->aw_mtx); + TAILQ_FOREACH_SAFE(otick, &data->aw_head, tk_aw_link, x_tick) { + if (otick->tk_unique == fii->unique) { + found = true; + break; + } + } + fuse_lck_mtx_unlock(data->aw_mtx); + + if (!found) { + /* Original is already complete. Just return */ + return 0; + } + + /* Clear the original ticket's interrupt association */ + otick->irq_unique = 0; + + if (tick->tk_aw_ohead.error == ENOSYS) { + fsess_set_notimpl(data->mp, FUSE_INTERRUPT); + return 0; + } else if (tick->tk_aw_ohead.error == EAGAIN) { + /* + * There are two reasons we might get this: + * 1) the daemon received the INTERRUPT request before the + * original, or + * 2) the daemon received the INTERRUPT request after it + * completed the original request. + * In the first case we should re-send the INTERRUPT. In the + * second, we should ignore it. + */ + /* Resend */ + fuse_interrupt_send(otick, EINTR); + return 0; + } else { + /* Illegal FUSE_INTERRUPT response */ + return EINVAL; + } } -static void -fuse_restore_sigs(sigset_t *oldset) +/* Interrupt the operation otick. Return err as its error code */ +void +fuse_interrupt_send(struct fuse_ticket *otick, int err) { + struct fuse_dispatcher fdi; + struct fuse_interrupt_in *fii; + struct fuse_in_header *ftick_hdr; + struct fuse_data *data = otick->tk_data; + struct fuse_ticket *tick, *xtick; + struct ucred reused_creds; + gid_t reused_groups[1]; - if (kern_sigprocmask(curthread, SIG_SETMASK, oldset, NULL, 0)) - panic("%s: Invalid operation for kern_sigprocmask()", - __func__); + if (otick->irq_unique == 0) { + /* + * If the daemon hasn't yet received otick, then we can answer + * it ourselves and return. 
+ */ + fuse_lck_mtx_lock(data->ms_mtx); + STAILQ_FOREACH_SAFE(tick, &otick->tk_data->ms_head, tk_ms_link, + xtick) { + if (tick == otick) { + STAILQ_REMOVE(&otick->tk_data->ms_head, tick, + fuse_ticket, tk_ms_link); + otick->tk_data->ms_count--; + otick->tk_ms_link.stqe_next = NULL; + fuse_lck_mtx_unlock(data->ms_mtx); + + fuse_lck_mtx_lock(otick->tk_aw_mtx); + if (!fticket_answered(otick)) { + fticket_set_answered(otick); + otick->tk_aw_errno = err; + wakeup(otick); + } + fuse_lck_mtx_unlock(otick->tk_aw_mtx); + + fuse_ticket_drop(tick); + return; + } + } + fuse_lck_mtx_unlock(data->ms_mtx); + + /* + * If the fuse daemon doesn't support interrupts, then there's + * nothing more that we can do + */ + if (!fsess_isimpl(data->mp, FUSE_INTERRUPT)) + return; + + /* + * If the fuse daemon has already received otick, then we must + * send FUSE_INTERRUPT. + */ + ftick_hdr = fticket_in_header(otick); + reused_creds.cr_uid = ftick_hdr->uid; + reused_groups[0] = ftick_hdr->gid; + reused_creds.cr_groups = reused_groups; + fdisp_init(&fdi, sizeof(*fii)); + fdisp_make_pid(&fdi, FUSE_INTERRUPT, data, ftick_hdr->nodeid, + ftick_hdr->pid, &reused_creds); + + fii = fdi.indata; + fii->unique = otick->tk_unique; + fuse_insert_callback(fdi.tick, fuse_interrupt_callback); + + otick->irq_unique = fdi.tick->tk_unique; + /* Interrupt ops should be delivered ASAP */ + fuse_insert_message(fdi.tick, true); + fdisp_destroy(&fdi); + } else { + /* This ticket has already been interrupted */ + } } void @@ -181,14 +293,19 @@ } fiov->allocated_size = FU_AT_LEAST(size); fiov->credit = fuse_iov_credit; + /* Clear data buffer after reallocation */ + bzero(fiov->base, size); + } else if (size > fiov->len) { + /* Clear newly extended portion of data buffer */ + bzero((char*)fiov->base + fiov->len, size - fiov->len); } fiov->len = size; } +/* Resize the fiov if needed, and clear its buffer */ void fiov_refresh(struct fuse_iov *fiov) { - bzero(fiov->base, fiov->len); fiov_adjust(fiov, 0); } @@ -211,8
+328,10 @@ if (ftick->tk_unique == 0) ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1); + ftick->irq_unique = 0; + refcount_init(&ftick->tk_refcount, 1); - atomic_add_acq_int(&fuse_ticket_count, 1); + counter_u64_add(fuse_ticket_count, 1); return 0; } @@ -227,7 +346,7 @@ FUSE_ASSERT_MS_DONE(ftick); FUSE_ASSERT_AW_DONE(ftick); - atomic_subtract_acq_int(&fuse_ticket_count, 1); + counter_u64_add(fuse_ticket_count, -1); } static int @@ -269,7 +388,7 @@ return uma_zfree(ticket_zone, ftick); } -static inline +static inline void fticket_refresh(struct fuse_ticket *ftick) { @@ -292,30 +411,65 @@ ftick->tk_flag = 0; } +/* Prepare the ticket to be reused, but don't clear its data buffers */ +static inline void +fticket_reset(struct fuse_ticket *ftick) +{ + FUSE_ASSERT_MS_DONE(ftick); + FUSE_ASSERT_AW_DONE(ftick); + + ftick->tk_ms_bufdata = NULL; + ftick->tk_ms_bufsize = 0; + ftick->tk_ms_type = FT_M_FIOV; + + bzero(&ftick->tk_aw_ohead, sizeof(struct fuse_out_header)); + + ftick->tk_aw_errno = 0; + ftick->tk_aw_bufdata = NULL; + ftick->tk_aw_bufsize = 0; + ftick->tk_aw_type = FT_A_FIOV; + + ftick->tk_flag = 0; +} + static int fticket_wait_answer(struct fuse_ticket *ftick) { - sigset_t tset; - int err = 0; - struct fuse_data *data; + struct thread *td = curthread; + sigset_t blockedset, oldset; + int err = 0, stops_deferred; + struct fuse_data *data = ftick->tk_data; + bool interrupted = false; + if (fsess_isimpl(ftick->tk_data->mp, FUSE_INTERRUPT) && + data->dataflags & FSESS_INTR) { + SIGEMPTYSET(blockedset); + } else { + /* Block all signals except (implicitly) SIGKILL */ + SIGFILLSET(blockedset); + } + stops_deferred = sigdeferstop(SIGDEFERSTOP_SILENT); + kern_sigprocmask(td, SIG_BLOCK, NULL, &oldset, 0); + fuse_lck_mtx_lock(ftick->tk_aw_mtx); +retry: if (fticket_answered(ftick)) { goto out; } - data = ftick->tk_data; if (fdata_get_dead(data)) { err = ENOTCONN; fticket_set_answered(ftick); goto out; } - fuse_block_sigs(&tset); + kern_sigprocmask(td, SIG_BLOCK,
&blockedset, NULL, 0); err = msleep(ftick, &ftick->tk_aw_mtx, PCATCH, "fu_ans", data->daemon_timeout * hz); - fuse_restore_sigs(&tset); - if (err == EAGAIN) { /* same as EWOULDBLOCK */ + kern_sigprocmask(td, SIG_SETMASK, &oldset, NULL, 0); + if (err == EWOULDBLOCK) { + SDT_PROBE2(fusefs, , ipc, trace, 3, + "fticket_wait_answer: EWOULDBLOCK"); #ifdef XXXIP /* die conditionally */ if (!fdata_get_dead(data)) { fdata_set_dead(data); @@ -323,14 +477,64 @@ #endif err = ETIMEDOUT; fticket_set_answered(ftick); + } else if ((err == EINTR || err == ERESTART)) { + /* + * Whether we get EINTR or ERESTART depends on whether + * SA_RESTART was set by sigaction(2). + * + * Try to interrupt the operation and wait for an EINTR response + * to the original operation. If the file system does not + * support FUSE_INTERRUPT, then we'll just wait for it to + * complete like normal. If it does support FUSE_INTERRUPT, + * then it will either respond EINTR to the original operation, + * or EAGAIN to the interrupt. + */ + sigset_t tmpset; + + SDT_PROBE2(fusefs, , ipc, trace, 4, + "fticket_wait_answer: interrupt"); + fuse_lck_mtx_unlock(ftick->tk_aw_mtx); + fuse_interrupt_send(ftick, err); + + PROC_LOCK(td->td_proc); + mtx_lock(&td->td_proc->p_sigacts->ps_mtx); + tmpset = td->td_proc->p_siglist; + SIGSETOR(tmpset, td->td_siglist); + mtx_unlock(&td->td_proc->p_sigacts->ps_mtx); + PROC_UNLOCK(td->td_proc); + + fuse_lck_mtx_lock(ftick->tk_aw_mtx); + if (!interrupted && !SIGISMEMBER(tmpset, SIGKILL)) { + /* + * Block all signals while we wait for an interrupt + * response. The protocol doesn't discriminate between + * different signals. + */ + SIGFILLSET(blockedset); + interrupted = true; + goto retry; + } else { + /* + * Return immediately for fatal signals, or if this is + * the second interruption. We should only be + * interrupted twice if the thread is stopped, for + * example during sigexit. 
+ */ + } + } else if (err) { + SDT_PROBE2(fusefs, , ipc, trace, 6, + "fticket_wait_answer: other error"); + } else { + SDT_PROBE2(fusefs, , ipc, trace, 7, "fticket_wait_answer: OK"); } out: if (!(err || fticket_answered(ftick))) { - SDT_PROBE2(fuse, , ipc, trace, 1, + SDT_PROBE2(fusefs, , ipc, trace, 1, "FUSE: requester was woken up but still no answer"); err = ENXIO; } fuse_lck_mtx_unlock(ftick->tk_aw_mtx); + sigallowstop(stops_deferred); return err; } @@ -386,6 +590,8 @@ data->fdev = fdev; mtx_init(&data->ms_mtx, "fuse message list mutex", NULL, MTX_DEF); STAILQ_INIT(&data->ms_head); + data->ms_count = 0; + knlist_init_mtx(&data->ks_rsel.si_note, &data->ms_mtx); mtx_init(&data->aw_mtx, "fuse answer list mutex", NULL, MTX_DEF); TAILQ_INIT(&data->aw_head); data->daemoncred = crhold(cred); @@ -405,11 +611,12 @@ return; /* Driving off stage all that stuff thrown at device... */ - mtx_destroy(&data->ms_mtx); - mtx_destroy(&data->aw_mtx); sx_destroy(&data->rename_lock); - crfree(data->daemoncred); + mtx_destroy(&data->aw_mtx); + knlist_delete(&data->ks_rsel.si_note, curthread, 0); + knlist_destroy(&data->ks_rsel.si_note); + mtx_destroy(&data->ms_mtx); free(data, M_FUSEMSG); } @@ -478,8 +685,14 @@ fuse_lck_mtx_unlock(ftick->tk_data->aw_mtx); } +/* + * Insert a new upgoing ticket into the message queue + * + * If urgent is true, insert at the front of the queue. Otherwise, insert in + * FIFO order. 
+ */ void -fuse_insert_message(struct fuse_ticket *ftick) +fuse_insert_message(struct fuse_ticket *ftick, bool urgent) { if (ftick->tk_flag & FT_DIRTY) { panic("FUSE: ticket reused without being refreshed"); @@ -490,9 +703,13 @@ return; } fuse_lck_mtx_lock(ftick->tk_data->ms_mtx); - fuse_ms_push(ftick); + if (urgent) + fuse_ms_push_head(ftick); + else + fuse_ms_push(ftick); wakeup_one(ftick->tk_data); selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1); + KNOTE_LOCKED(&ftick->tk_data->ks_rsel.si_note, 0); fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx); } @@ -505,8 +722,21 @@ opcode = fticket_opcode(ftick); switch (opcode) { + case FUSE_BMAP: + err = (blen == sizeof(struct fuse_bmap_out)) ? 0 : EINVAL; + break; + + case FUSE_LINK: case FUSE_LOOKUP: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; + case FUSE_MKDIR: + case FUSE_MKNOD: + case FUSE_SYMLINK: + if (fuse_libabi_geq(ftick->tk_data, 7, 9)) { + err = (blen == sizeof(struct fuse_entry_out)) ? + 0 : EINVAL; + } else { + err = (blen == FUSE_COMPAT_ENTRY_OUT_SIZE) ? 0 : EINVAL; + } break; case FUSE_FORGET: @@ -514,29 +744,19 @@ break; case FUSE_GETATTR: - err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL; - break; - case FUSE_SETATTR: - err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL; + if (fuse_libabi_geq(ftick->tk_data, 7, 9)) { + err = (blen == sizeof(struct fuse_attr_out)) ? + 0 : EINVAL; + } else { + err = (blen == FUSE_COMPAT_ATTR_OUT_SIZE) ? 0 : EINVAL; + } break; case FUSE_READLINK: err = (PAGE_SIZE >= blen) ? 0 : EINVAL; break; - case FUSE_SYMLINK: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; - break; - - case FUSE_MKNOD: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; - break; - - case FUSE_MKDIR: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; - break; - case FUSE_UNLINK: err = (blen == 0) ? 0 : EINVAL; break; @@ -549,10 +769,6 @@ err = (blen == 0) ? 
0 : EINVAL; break; - case FUSE_LINK: - err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; - break; - case FUSE_OPEN: err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL; break; @@ -607,7 +823,9 @@ break; case FUSE_INIT: - if (blen == sizeof(struct fuse_init_out) || blen == 8) { + if (blen == sizeof(struct fuse_init_out) || + blen == FUSE_COMPAT_INIT_OUT_SIZE || + blen == FUSE_COMPAT_22_INIT_OUT_SIZE) { err = 0; } else { err = EINVAL; @@ -634,15 +852,15 @@ break; case FUSE_GETLK: - panic("FUSE: no response body format check for FUSE_GETLK"); + err = (blen == sizeof(struct fuse_lk_out)) ? 0 : EINVAL; break; case FUSE_SETLK: - panic("FUSE: no response body format check for FUSE_SETLK"); + err = (blen == 0) ? 0 : EINVAL; break; case FUSE_SETLKW: - panic("FUSE: no response body format check for FUSE_SETLKW"); + err = (blen == 0) ? 0 : EINVAL; break; case FUSE_ACCESS: @@ -650,8 +868,13 @@ break; case FUSE_CREATE: - err = (blen == sizeof(struct fuse_entry_out) + - sizeof(struct fuse_open_out)) ? 0 : EINVAL; + if (fuse_libabi_geq(ftick->tk_data, 7, 9)) { + err = (blen == sizeof(struct fuse_entry_out) + + sizeof(struct fuse_open_out)) ? 0 : EINVAL; + } else { + err = (blen == FUSE_COMPAT_ENTRY_OUT_SIZE + + sizeof(struct fuse_open_out)) ? 
0 : EINVAL; + } break; case FUSE_DESTROY: @@ -677,7 +900,7 @@ ihead->pid = pid; ihead->uid = cred->cr_uid; - ihead->gid = cred->cr_rgid; + ihead->gid = cred->cr_groups[0]; } /* @@ -705,18 +928,38 @@ return err; } -void -fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, +/* + * Reinitialize a dispatcher from a pid and node id, without resizing or + * clearing its data buffers + */ +static void +fdisp_refresh_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred) { - struct fuse_data *data = fuse_get_mpdata(mp); + MPASS(fdip->tick); + MPASS2(sizeof(fdip->finh) + fdip->iosize <= fdip->tick->tk_ms_fiov.len, + "Must use fdisp_make_pid to increase the size of the fiov"); + fticket_reset(fdip->tick); + FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh, + fdip->indata, fdip->iosize); + + fuse_setup_ihead(fdip->finh, fdip->tick, nid, op, fdip->iosize, pid, + cred); +} + +/* Initialize a dispatcher from a pid and node id */ +static void +fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, + struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred) +{ if (fdip->tick) { fticket_refresh(fdip->tick); } else { fdip->tick = fuse_ticket_fetch(data); } + /* FUSE_DIMALLOC will bzero the fiovs when it enlarges them */ FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh, fdip->indata, fdip->iosize); @@ -727,22 +970,42 @@ fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred) { + struct fuse_data *data = fuse_get_mpdata(mp); RECTIFY_TDCR(td, cred); - return fdisp_make_pid(fdip, op, mp, nid, td->td_proc->p_pid, cred); + return fdisp_make_pid(fdip, op, data, nid, td->td_proc->p_pid, cred); } void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred) { + struct mount *mp = vnode_mount(vp); + struct fuse_data *data = fuse_get_mpdata(mp); + 
RECTIFY_TDCR(td, cred); - return fdisp_make_pid(fdip, op, vnode_mount(vp), VTOI(vp), + return fdisp_make_pid(fdip, op, data, VTOI(vp), td->td_proc->p_pid, cred); } -SDT_PROBE_DEFINE2(fuse, , ipc, fdisp_wait_answ_error, "char*", "int"); +/* Refresh a fuse_dispatcher so it can be reused, but don't zero its data */ +void +fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, + struct vnode *vp, struct thread *td, struct ucred *cred) +{ + RECTIFY_TDCR(td, cred); + return fdisp_refresh_pid(fdip, op, vnode_mount(vp), VTOI(vp), + td->td_proc->p_pid, cred); +} +void +fdisp_refresh(struct fuse_dispatcher *fdip) +{ + fticket_refresh(fdip->tick); +} + +SDT_PROBE_DEFINE2(fusefs, , ipc, fdisp_wait_answ_error, "char*", "int"); + int fdisp_wait_answ(struct fuse_dispatcher *fdip) { @@ -750,7 +1013,7 @@ fdip->answ_stat = 0; fuse_insert_callback(fdip->tick, fuse_standard_handler); - fuse_insert_message(fdip->tick); + fuse_insert_message(fdip->tick, false); if ((err = fticket_wait_answer(fdip->tick))) { fuse_lck_mtx_lock(fdip->tick->tk_aw_mtx); @@ -761,7 +1024,7 @@ * the standard handler has completed his job. * So we drop the ticket and exit as usual. */ - SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, + SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error, "IPC: interrupted, already answered", err); fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx); goto out; @@ -771,7 +1034,7 @@ * Then by setting the answered flag we get *him* * to drop the ticket. 
*/ - SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, + SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error, "IPC: interrupted, setting to answered", err); fticket_set_answered(fdip->tick); fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx); @@ -779,14 +1042,22 @@ } } - if (fdip->tick->tk_aw_errno) { - SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, + if (fdip->tick->tk_aw_errno == ENOTCONN) { + /* The daemon died while we were waiting for a response */ + err = ENOTCONN; + goto out; + } else if (fdip->tick->tk_aw_errno) { + /* + * There was some sort of communication error with the daemon + * that the client wouldn't understand. + */ + SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error, "IPC: explicit EIO-ing", fdip->tick->tk_aw_errno); err = EIO; goto out; } if ((err = fdip->tick->tk_aw_ohead.error)) { - SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, + SDT_PROBE2(fusefs, , ipc, fdisp_wait_answ_error, "IPC: setting status", fdip->tick->tk_aw_ohead.error); /* * This means a "proper" fuse syscall error. @@ -815,10 +1086,12 @@ ticket_zone = uma_zcreate("fuse_ticket", sizeof(struct fuse_ticket), fticket_ctor, fticket_dtor, fticket_init, fticket_fini, UMA_ALIGN_PTR, 0); + fuse_ticket_count = counter_u64_alloc(M_WAITOK); } void fuse_ipc_destroy(void) { + counter_u64_free(fuse_ticket_count); uma_zdestroy(ticket_zone); } Index: sys/fs/fuse/fuse_kernel.h =================================================================== --- sys/fs/fuse/fuse_kernel.h +++ sys/fs/fuse/fuse_kernel.h @@ -1,6 +1,6 @@ /*-- * This file defines the kernel interface of FUSE - * Copyright (C) 2001-2007 Miklos Szeredi + * Copyright (C) 2001-2008 Miklos Szeredi * * This program can be distributed under the terms of the GNU GPL. * See the file COPYING. 
@@ -34,69 +34,134 @@ * $FreeBSD$ */ -#ifndef linux -#include -#define __u64 uint64_t -#define __u32 uint32_t -#define __s32 int32_t +/* + * This file defines the kernel interface of FUSE + * + * Protocol changelog: + * + * 7.9: + * - new fuse_getattr_in input argument of GETATTR + * - add lk_flags in fuse_lk_in + * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in + * - add blksize field to fuse_attr + * - add file flags field to fuse_read_in and fuse_write_in + * + * 7.10 + * - add nonseekable open flag + * + * 7.11 + * - add IOCTL message + * - add unsolicited notification support + * + * 7.12 + * - add umask flag to input argument of open, mknod and mkdir + * - add notification messages for invalidation of inodes and + * directory entries + * + * 7.13 + * - make max number of background requests and congestion threshold + * tunables + * + * 7.14 + * - add splice support to fuse device + * + * 7.15 + * - add store notify + * - add retrieve notify + * + * 7.16 + * - add BATCH_FORGET request + * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct + * fuse_ioctl_iovec' instead of ambiguous 'struct iovec' + * - add FUSE_IOCTL_32BIT flag + * + * 7.17 + * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK + * + * 7.18 + * - add FUSE_IOCTL_DIR flag + * - add FUSE_NOTIFY_DELETE + * + * 7.19 + * - add FUSE_FALLOCATE + * + * 7.20 + * - add FUSE_AUTO_INVAL_DATA + * 7.21 + * - add FUSE_READDIRPLUS + * - send the requested events in POLL request + * + * 7.22 + * - add FUSE_ASYNC_DIO + * + * 7.23 + * - add FUSE_WRITEBACK_CACHE + * - add time_gran to fuse_init_out + * - add reserved space to fuse_init_out + * - add FATTR_CTIME + * - add ctime and ctimensec to fuse_setattr_in + * - add FUSE_RENAME2 request + * - add FUSE_NO_OPEN_SUPPORT flag + */ + +#ifndef _FUSE_FUSE_KERNEL_H +#define _FUSE_FUSE_KERNEL_H + +#ifdef __linux__ +#include #else -#include -#include +#include #endif /** Version number of this interface */ #define 
FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 8 +#define FUSE_KERNEL_MINOR_VERSION 23 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 -/** The major number of the fuse character device */ -#define FUSE_MAJOR MISC_MAJOR - -/** The minor number of the fuse character device */ -#define FUSE_MINOR 229 - /* Make sure all structures are padded to 64bit boundary, so 32bit userspace works under 64bit kernels */ struct fuse_attr { - __u64 ino; - __u64 size; - __u64 blocks; - __u64 atime; - __u64 mtime; - __u64 ctime; - __u32 atimensec; - __u32 mtimensec; - __u32 ctimensec; - __u32 mode; - __u32 nlink; - __u32 uid; - __u32 gid; - __u32 rdev; + uint64_t ino; + uint64_t size; + uint64_t blocks; + uint64_t atime; + uint64_t mtime; + uint64_t ctime; + uint32_t atimensec; + uint32_t mtimensec; + uint32_t ctimensec; + uint32_t mode; + uint32_t nlink; + uint32_t uid; + uint32_t gid; + uint32_t rdev; + uint32_t blksize; + uint32_t padding; }; struct fuse_kstatfs { - __u64 blocks; - __u64 bfree; - __u64 bavail; - __u64 files; - __u64 ffree; - __u32 bsize; - __u32 namelen; - __u32 frsize; - __u32 padding; - __u32 spare[6]; + uint64_t blocks; + uint64_t bfree; + uint64_t bavail; + uint64_t files; + uint64_t ffree; + uint32_t bsize; + uint32_t namelen; + uint32_t frsize; + uint32_t padding; + uint32_t spare[6]; }; struct fuse_file_lock { - __u64 start; - __u64 end; - __u32 type; - __u32 pid; /* tgid */ + uint64_t start; + uint64_t end; + uint32_t type; + uint32_t pid; /* tgid */ }; /** @@ -109,27 +174,128 @@ #define FATTR_ATIME (1 << 4) #define FATTR_MTIME (1 << 5) #define FATTR_FH (1 << 6) +#define FATTR_ATIME_NOW (1 << 7) +#define FATTR_MTIME_NOW (1 << 8) +#define FATTR_LOCKOWNER (1 << 9) +#define FATTR_CTIME (1 << 10) /** * Flags returned by the OPEN request * * FOPEN_DIRECT_IO: bypass page cache for this open file * FOPEN_KEEP_CACHE: don't invalidate the data cache on open + * FOPEN_NONSEEKABLE: the file is not 
seekable */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) +#define FOPEN_NONSEEKABLE (1 << 2) /** * INIT request/reply flags + * + * FUSE_ASYNC_READ: asynchronous read requests + * FUSE_POSIX_LOCKS: remote locking for POSIX file locks + * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported) + * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem + * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." + * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB + * FUSE_DONT_MASK: don't apply umask to file mode on create operations + * FUSE_SPLICE_WRITE: kernel supports splice write on the device + * FUSE_SPLICE_MOVE: kernel supports splice move on the device + * FUSE_SPLICE_READ: kernel supports splice read on the device + * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks + * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories + * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages + * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one) + * FUSE_READDIRPLUS_AUTO: adaptive readdirplus + * FUSE_ASYNC_DIO: asynchronous direct I/O submission + * FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes + * FUSE_NO_OPEN_SUPPORT: kernel supports zero-message opens */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) +#define FUSE_FILE_OPS (1 << 2) +#define FUSE_ATOMIC_O_TRUNC (1 << 3) +#define FUSE_EXPORT_SUPPORT (1 << 4) +#define FUSE_BIG_WRITES (1 << 5) +#define FUSE_DONT_MASK (1 << 6) +#define FUSE_SPLICE_WRITE (1 << 7) +#define FUSE_SPLICE_MOVE (1 << 8) +#define FUSE_SPLICE_READ (1 << 9) +#define FUSE_FLOCK_LOCKS (1 << 10) +#define FUSE_HAS_IOCTL_DIR (1 << 11) +#define FUSE_AUTO_INVAL_DATA (1 << 12) +#define FUSE_DO_READDIRPLUS (1 << 13) +#define FUSE_READDIRPLUS_AUTO (1 << 14) +#define FUSE_ASYNC_DIO (1 << 15) +#define FUSE_WRITEBACK_CACHE (1 << 16) +#define FUSE_NO_OPEN_SUPPORT (1 << 17) +#ifdef linux /** + * CUSE INIT 
request/reply flags + * + * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl + */ +#define CUSE_UNRESTRICTED_IOCTL (1 << 0) +#endif /* linux */ + +/** * Release flags */ #define FUSE_RELEASE_FLUSH (1 << 0) +#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1) +/** + * Getattr flags + */ +#define FUSE_GETATTR_FH (1 << 0) + +/** + * Lock flags + */ +#define FUSE_LK_FLOCK (1 << 0) + +/** + * WRITE flags + * + * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed + * FUSE_WRITE_LOCKOWNER: lock_owner field is valid + */ +#define FUSE_WRITE_CACHE (1 << 0) +#define FUSE_WRITE_LOCKOWNER (1 << 1) + +/** + * Read flags + */ +#define FUSE_READ_LOCKOWNER (1 << 1) + +/** + * Ioctl flags + * + * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine + * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed + * FUSE_IOCTL_RETRY: retry with new iovecs + * FUSE_IOCTL_32BIT: 32bit ioctl + * FUSE_IOCTL_DIR: is a directory + * + * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs + */ +#define FUSE_IOCTL_COMPAT (1 << 0) +#define FUSE_IOCTL_UNRESTRICTED (1 << 1) +#define FUSE_IOCTL_RETRY (1 << 2) +#define FUSE_IOCTL_32BIT (1 << 3) +#define FUSE_IOCTL_DIR (1 << 4) + +#define FUSE_IOCTL_MAX_IOV 256 + +/** + * Poll flags + * + * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify + */ +#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0) + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ @@ -167,107 +333,179 @@ FUSE_INTERRUPT = 36, FUSE_BMAP = 37, FUSE_DESTROY = 38, + FUSE_IOCTL = 39, + FUSE_POLL = 40, + FUSE_NOTIFY_REPLY = 41, + FUSE_BATCH_FORGET = 42, + FUSE_FALLOCATE = 43, + FUSE_READDIRPLUS = 44, + FUSE_RENAME2 = 45, + +#ifdef linux + /* CUSE specific operations */ + CUSE_INIT = 4096, +#endif /* linux */ }; +enum fuse_notify_code { + FUSE_NOTIFY_POLL = 1, + FUSE_NOTIFY_INVAL_INODE = 2, + FUSE_NOTIFY_INVAL_ENTRY = 3, + FUSE_NOTIFY_STORE = 4, + FUSE_NOTIFY_RETRIEVE = 5, + FUSE_NOTIFY_DELETE = 6, + FUSE_NOTIFY_CODE_MAX, +}; + /* The read buffer is 
required to be at least 8k, but may be much larger */ #define FUSE_MIN_READ_BUFFER 8192 +#define FUSE_COMPAT_ENTRY_OUT_SIZE 120 + struct fuse_entry_out { - __u64 nodeid; /* Inode ID */ - __u64 generation; /* Inode generation: nodeid:gen must - be unique for the fs's lifetime */ - __u64 entry_valid; /* Cache timeout for the name */ - __u64 attr_valid; /* Cache timeout for the attributes */ - __u32 entry_valid_nsec; - __u32 attr_valid_nsec; + uint64_t nodeid; /* Inode ID */ + uint64_t generation; /* Inode generation: nodeid:gen must + be unique for the fs's lifetime */ + uint64_t entry_valid; /* Cache timeout for the name */ + uint64_t attr_valid; /* Cache timeout for the attributes */ + uint32_t entry_valid_nsec; + uint32_t attr_valid_nsec; struct fuse_attr attr; }; struct fuse_forget_in { - __u64 nlookup; + uint64_t nlookup; }; +struct fuse_forget_one { + uint64_t nodeid; + uint64_t nlookup; +}; + +struct fuse_batch_forget_in { + uint32_t count; + uint32_t dummy; +}; + +struct fuse_getattr_in { + uint32_t getattr_flags; + uint32_t dummy; + uint64_t fh; +}; + +#define FUSE_COMPAT_ATTR_OUT_SIZE 96 + struct fuse_attr_out { - __u64 attr_valid; /* Cache timeout for the attributes */ - __u32 attr_valid_nsec; - __u32 dummy; + uint64_t attr_valid; /* Cache timeout for the attributes */ + uint32_t attr_valid_nsec; + uint32_t dummy; struct fuse_attr attr; }; +#define FUSE_COMPAT_MKNOD_IN_SIZE 8 + +struct fuse_mknod_in { + uint32_t mode; + uint32_t rdev; + uint32_t umask; + uint32_t padding; +}; + struct fuse_mkdir_in { - __u32 mode; - __u32 padding; + uint32_t mode; + uint32_t umask; }; struct fuse_rename_in { - __u64 newdir; + uint64_t newdir; }; +struct fuse_rename2_in { + uint64_t newdir; + uint32_t flags; + uint32_t padding; +}; + struct fuse_link_in { - __u64 oldnodeid; + uint64_t oldnodeid; }; struct fuse_setattr_in { - __u32 valid; - __u32 padding; - __u64 fh; - __u64 size; - __u64 unused1; - __u64 atime; - __u64 mtime; - __u64 unused2; - __u32 atimensec; - __u32 
mtimensec; - __u32 unused3; - __u32 mode; - __u32 unused4; - __u32 uid; - __u32 gid; - __u32 unused5; + uint32_t valid; + uint32_t padding; + uint64_t fh; + uint64_t size; + uint64_t lock_owner; + uint64_t atime; + uint64_t mtime; + uint64_t ctime; + uint32_t atimensec; + uint32_t mtimensec; + uint32_t ctimensec; + uint32_t mode; + uint32_t unused4; + uint32_t uid; + uint32_t gid; + uint32_t unused5; }; struct fuse_open_in { - __u32 flags; - __u32 mode; + uint32_t flags; + uint32_t unused; }; +struct fuse_create_in { + uint32_t flags; + uint32_t mode; + uint32_t umask; + uint32_t padding; +}; + struct fuse_open_out { - __u64 fh; - __u32 open_flags; - __u32 padding; + uint64_t fh; + uint32_t open_flags; + uint32_t padding; }; struct fuse_release_in { - __u64 fh; - __u32 flags; - __u32 release_flags; - __u64 lock_owner; + uint64_t fh; + uint32_t flags; + uint32_t release_flags; + uint64_t lock_owner; }; struct fuse_flush_in { - __u64 fh; - __u32 unused; - __u32 padding; - __u64 lock_owner; + uint64_t fh; + uint32_t unused; + uint32_t padding; + uint64_t lock_owner; }; struct fuse_read_in { - __u64 fh; - __u64 offset; - __u32 size; - __u32 padding; + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t read_flags; + uint64_t lock_owner; + uint32_t flags; + uint32_t padding; }; +#define FUSE_COMPAT_WRITE_IN_SIZE 24 + struct fuse_write_in { - __u64 fh; - __u64 offset; - __u32 size; - __u32 write_flags; + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t write_flags; + uint64_t lock_owner; + uint32_t flags; + uint32_t padding; }; struct fuse_write_out { - __u32 size; - __u32 padding; + uint32_t size; + uint32_t padding; }; #define FUSE_COMPAT_STATFS_SIZE 48 @@ -277,40 +515,42 @@ }; struct fuse_fsync_in { - __u64 fh; - __u32 fsync_flags; - __u32 padding; + uint64_t fh; + uint32_t fsync_flags; + uint32_t padding; }; +struct fuse_setxattr_in { + uint32_t size; + uint32_t flags; +}; + struct fuse_listxattr_in { - __u32 size; - __u32 flags; + uint32_t size; + 
uint32_t padding; }; struct fuse_listxattr_out { - __u32 size; - __u32 flags; + uint32_t size; + uint32_t padding; }; struct fuse_getxattr_in { - __u32 size; - __u32 padding; + uint32_t size; + uint32_t padding; }; struct fuse_getxattr_out { - __u32 size; - __u32 padding; + uint32_t size; + uint32_t padding; }; -struct fuse_setxattr_in { - __u32 size; - __u32 flags; -}; - struct fuse_lk_in { - __u64 fh; - __u64 owner; + uint64_t fh; + uint64_t owner; struct fuse_file_lock lk; + uint32_t lk_flags; + uint32_t padding; }; struct fuse_lk_out { @@ -318,66 +558,197 @@ }; struct fuse_access_in { - __u32 mask; - __u32 padding; + uint32_t mask; + uint32_t padding; }; struct fuse_init_in { - __u32 major; - __u32 minor; - __u32 max_readahead; - __u32 flags; + uint32_t major; + uint32_t minor; + uint32_t max_readahead; + uint32_t flags; }; +#define FUSE_COMPAT_INIT_OUT_SIZE 8 +#define FUSE_COMPAT_22_INIT_OUT_SIZE 24 + struct fuse_init_out { - __u32 major; - __u32 minor; - __u32 max_readahead; - __u32 flags; - __u32 unused; - __u32 max_write; + uint32_t major; + uint32_t minor; + uint32_t max_readahead; + uint32_t flags; + uint16_t max_background; + uint16_t congestion_threshold; + uint32_t max_write; + uint32_t time_gran; + uint32_t unused[9]; }; +#ifdef linux +#define CUSE_INIT_INFO_MAX 4096 + +struct cuse_init_in { + uint32_t major; + uint32_t minor; + uint32_t unused; + uint32_t flags; +}; + +struct cuse_init_out { + uint32_t major; + uint32_t minor; + uint32_t unused; + uint32_t flags; + uint32_t max_read; + uint32_t max_write; + uint32_t dev_major; /* chardev major */ + uint32_t dev_minor; /* chardev minor */ + uint32_t spare[10]; +}; +#endif /* linux */ + struct fuse_interrupt_in { - __u64 unique; + uint64_t unique; }; struct fuse_bmap_in { - __u64 block; - __u32 blocksize; - __u32 padding; + uint64_t block; + uint32_t blocksize; + uint32_t padding; }; struct fuse_bmap_out { - __u64 block; + uint64_t block; }; +struct fuse_ioctl_in { + uint64_t fh; + uint32_t flags; + 
uint32_t cmd; + uint64_t arg; + uint32_t in_size; + uint32_t out_size; +}; + +struct fuse_ioctl_iovec { + uint64_t base; + uint64_t len; +}; + +struct fuse_ioctl_out { + int32_t result; + uint32_t flags; + uint32_t in_iovs; + uint32_t out_iovs; +}; + +struct fuse_poll_in { + uint64_t fh; + uint64_t kh; + uint32_t flags; + uint32_t events; +}; + +struct fuse_poll_out { + uint32_t revents; + uint32_t padding; +}; + +struct fuse_notify_poll_wakeup_out { + uint64_t kh; +}; + +struct fuse_fallocate_in { + uint64_t fh; + uint64_t offset; + uint64_t length; + uint32_t mode; + uint32_t padding; +}; + struct fuse_in_header { - __u32 len; - __u32 opcode; - __u64 unique; - __u64 nodeid; - __u32 uid; - __u32 gid; - __u32 pid; - __u32 padding; + uint32_t len; + uint32_t opcode; + uint64_t unique; + uint64_t nodeid; + uint32_t uid; + uint32_t gid; + uint32_t pid; + uint32_t padding; }; struct fuse_out_header { - __u32 len; - __s32 error; - __u64 unique; + uint32_t len; + int32_t error; + uint64_t unique; }; struct fuse_dirent { - __u64 ino; - __u64 off; - __u32 namelen; - __u32 type; - char name[0]; + uint64_t ino; + uint64_t off; + uint32_t namelen; + uint32_t type; + char name[]; }; #define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) -#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1)) +#define FUSE_DIRENT_ALIGN(x) \ + (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1)) #define FUSE_DIRENT_SIZE(d) \ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) + +struct fuse_direntplus { + struct fuse_entry_out entry_out; + struct fuse_dirent dirent; +}; + +#define FUSE_NAME_OFFSET_DIRENTPLUS \ + offsetof(struct fuse_direntplus, dirent.name) +#define FUSE_DIRENTPLUS_SIZE(d) \ + FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen) + +struct fuse_notify_inval_inode_out { + uint64_t ino; + int64_t off; + int64_t len; +}; + +struct fuse_notify_inval_entry_out { + uint64_t parent; + uint32_t namelen; + uint32_t padding; +}; + +struct 
fuse_notify_delete_out { + uint64_t parent; + uint64_t child; + uint32_t namelen; + uint32_t padding; +}; + +struct fuse_notify_store_out { + uint64_t nodeid; + uint64_t offset; + uint32_t size; + uint32_t padding; +}; + +struct fuse_notify_retrieve_out { + uint64_t notify_unique; + uint64_t nodeid; + uint64_t offset; + uint32_t size; + uint32_t padding; +}; + +/* Matches the size of fuse_write_in */ +struct fuse_notify_retrieve_in { + uint64_t dummy1; + uint64_t offset; + uint32_t size; + uint32_t dummy2; + uint64_t dummy3; + uint64_t dummy4; +}; + +#endif /* _FUSE_FUSE_KERNEL_H */ Index: sys/fs/fuse/fuse_main.c =================================================================== --- sys/fs/fuse/fuse_main.c +++ sys/fs/fuse/fuse_main.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -77,6 +82,10 @@ #include #include "fuse.h" +#include "fuse_file.h" +#include "fuse_ipc.h" +#include "fuse_internal.h" +#include "fuse_node.h" static void fuse_bringdown(eventhandler_tag eh_tag); static int fuse_loader(struct module *m, int what, void *arg); @@ -85,7 +94,7 @@ extern struct vfsops fuse_vfsops; extern struct cdevsw fuse_cdevsw; -extern struct vop_vector fuse_vnops; +extern struct vop_vector fuse_fifonops; extern uma_zone_t fuse_pbuf_zone; static struct vfsconf fuse_vfsconf = { @@ -96,11 +105,13 @@ .vfc_flags = VFCF_JAIL | VFCF_SYNTHETIC }; +SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables"); +SYSCTL_NODE(_vfs_fusefs, OID_AUTO, stats, CTLFLAG_RW, 0, "FUSE statistics"); SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_major, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FUSE_KERNEL_VERSION, "FUSE kernel abi major version"); 
SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_minor, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FUSE_KERNEL_MINOR_VERSION, "FUSE kernel abi minor version"); -SDT_PROVIDER_DEFINE(fuse); +SDT_PROVIDER_DEFINE(fusefs); /****************************** * @@ -111,7 +122,9 @@ static void fuse_bringdown(eventhandler_tag eh_tag) { - + fuse_node_destroy(); + fuse_internal_destroy(); + fuse_file_destroy(); fuse_ipc_destroy(); fuse_device_destroy(); mtx_destroy(&fuse_mtx); @@ -132,16 +145,14 @@ return (err); } fuse_ipc_init(); + fuse_file_init(); + fuse_internal_init(); + fuse_node_init(); fuse_pbuf_zone = pbuf_zsecond_create("fusepbuf", nswbuf / 2); /* vfs_modevent ignores its first arg */ if ((err = vfs_modevent(NULL, what, &fuse_vfsconf))) fuse_bringdown(eh_tag); - else - printf("fuse-freebsd: version %s, FUSE ABI %d.%d\n", - FUSE_FREEBSD_VERSION, - FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); - break; case MOD_UNLOAD: if ((err = vfs_modevent(NULL, what, &fuse_vfsconf))) Index: sys/fs/fuse/fuse_node.h =================================================================== --- sys/fs/fuse/fuse_node.h +++ sys/fs/fuse/fuse_node.h @@ -32,6 +32,11 @@ * * Copyright (C) 2005 Csaba Henk. * All rights reserved. + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -60,60 +65,121 @@ #ifndef _FUSE_NODE_H_ #define _FUSE_NODE_H_ +#include #include #include #include "fuse_file.h" -#define FN_REVOKED 0x00000020 -#define FN_FLUSHINPROG 0x00000040 -#define FN_FLUSHWANT 0x00000080 -#define FN_SIZECHANGE 0x00000100 -#define FN_DIRECTIO 0x00000200 +#define FN_REVOKED 0x00000020 +#define FN_FLUSHINPROG 0x00000040 +#define FN_FLUSHWANT 0x00000080 +/* + * Indicates that the file's size is dirty; the kernel has changed it but not + * yet sent the change to the daemon. When this bit is set, the + * cached_attrs.va_size field does not time out. + */ +#define FN_SIZECHANGE 0x00000100 +#define FN_DIRECTIO 0x00000200 +/* Indicates that parent_nid is valid */ +#define FN_PARENT_NID 0x00000400 +/* + * Indicates that the file's cached timestamps are dirty. They will be flushed + * during the next SETATTR or WRITE. Until then, the cached fields will not + * time out. + */ +#define FN_MTIMECHANGE 0x00000800 +#define FN_CTIMECHANGE 0x00001000 + struct fuse_vnode_data { /** self **/ uint64_t nid; + uint64_t generation; /** parent **/ - /* XXXIP very likely to be stale, it's not updated in rename() */ uint64_t parent_nid; /** I/O **/ - struct fuse_filehandle fufh[FUFH_MAXTYPE]; + /* List of file handles for all of the vnode's open file descriptors */ + LIST_HEAD(, fuse_filehandle) handles; /** flags **/ uint32_t flag; /** meta **/ - bool valid_attr_cache; + /* The monotonic time after which the attr cache is invalid */ + struct bintime attr_cache_timeout; + /* + * Monotonic time after which the entry is invalid. Used for lookups + * by nodeid instead of pathname. + */ + struct bintime entry_cache_timeout; struct vattr cached_attrs; - off_t filesize; uint64_t nlookup; enum vtype vtype; }; +/* + * This overlays the fid structure (see mount.h). Mostly the same as the types + * used by UFS and ext2.
+ */ +struct fuse_fid { + uint16_t len; /* Length of structure. */ + uint16_t pad; /* Force 32-bit alignment. */ + uint32_t gen; /* Generation number. */ + uint64_t nid; /* FUSE node id. */ +}; + #define VTOFUD(vp) \ ((struct fuse_vnode_data *)((vp)->v_data)) #define VTOI(vp) (VTOFUD(vp)->nid) -#define VTOVA(vp) \ - (VTOFUD(vp)->valid_attr_cache ? \ - &(VTOFUD(vp)->cached_attrs) : NULL) +static inline struct vattr* +VTOVA(struct vnode *vp) +{ + struct bintime now; + + getbinuptime(&now); + if (bintime_cmp(&(VTOFUD(vp)->attr_cache_timeout), &now, >)) + return &(VTOFUD(vp)->cached_attrs); + else + return NULL; +} + +static inline void +fuse_vnode_clear_attr_cache(struct vnode *vp) +{ + bintime_clear(&VTOFUD(vp)->attr_cache_timeout); +} + +static uint32_t inline +fuse_vnode_hash(uint64_t id) +{ + return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT)); +} + #define VTOILLU(vp) ((uint64_t)(VTOFUD(vp) ? VTOI(vp) : 0)) #define FUSE_NULL_ID 0 +extern struct vop_vector fuse_fifoops; extern struct vop_vector fuse_vnops; +int fuse_vnode_cmp(struct vnode *vp, void *nidp); + static inline void fuse_vnode_setparent(struct vnode *vp, struct vnode *dvp) { if (dvp != NULL && vp->v_type == VDIR) { MPASS(dvp->v_type == VDIR); VTOFUD(vp)->parent_nid = VTOI(dvp); + VTOFUD(vp)->flag |= FN_PARENT_NID; } } +int fuse_vnode_size(struct vnode *vp, off_t *filesize, struct ucred *cred, + struct thread *td); + void fuse_vnode_destroy(struct vnode *vp); int fuse_vnode_get(struct mount *mp, struct fuse_entry_out *feo, @@ -123,10 +189,14 @@ void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td); -void fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred); +int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred, pid_t pid); -int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred); - int fuse_vnode_setsize(struct vnode *vp, off_t newsize); +void fuse_vnode_undirty_cached_timestamps(struct vnode *vp); + +void fuse_vnode_update(struct vnode *vp, int flags); + 
+void fuse_node_init(void); +void fuse_node_destroy(void); #endif /* _FUSE_NODE_H_ */ Index: sys/fs/fuse/fuse_node.c =================================================================== --- sys/fs/fuse/fuse_node.c +++ sys/fs/fuse/fuse_node.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -59,8 +64,9 @@ __FBSDID("$FreeBSD$"); #include -#include #include +#include +#include #include #include #include @@ -77,8 +83,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -89,65 +95,40 @@ #include "fuse_io.h" #include "fuse_ipc.h" -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , node, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , node, trace, "int", "char*"); MALLOC_DEFINE(M_FUSEVN, "fuse_vnode", "fuse vnode private data"); static int sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS); -static int fuse_node_count = 0; +static counter_u64_t fuse_node_count; -SYSCTL_INT(_vfs_fusefs, OID_AUTO, node_count, CTLFLAG_RD, - &fuse_node_count, 0, "Count of FUSE vnodes"); +SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, node_count, CTLFLAG_RD, + &fuse_node_count, "Count of FUSE vnodes"); int fuse_data_cache_mode = FUSE_CACHE_WT; +/* + * DEPRECATED + * This sysctl is no longer needed as of fuse protocol 7.23. 
Individual + * servers can select the cache behavior they need for each mountpoint: + * - writethrough: the default + * - writeback: set FUSE_WRITEBACK_CACHE in fuse_init_out.flags + * - uncached: set FOPEN_DIRECT_IO for every file + * The sysctl is retained primarily for use by jails supporting older FUSE + * protocols. It may be removed entirely once FreeBSD 11.3 and 12.0 are EOL. + */ SYSCTL_PROC(_vfs_fusefs, OID_AUTO, data_cache_mode, CTLTYPE_INT|CTLFLAG_RW, &fuse_data_cache_mode, 0, sysctl_fuse_cache_mode, "I", "Zero: disable caching of FUSE file data; One: write-through caching " "(default); Two: write-back caching (generally unsafe)"); -int fuse_data_cache_invalidate = 0; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, data_cache_invalidate, CTLFLAG_RW, - &fuse_data_cache_invalidate, 0, - "If non-zero, discard cached clean file data when there are no active file" - " users"); - -int fuse_mmap_enable = 1; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, mmap_enable, CTLFLAG_RW, - &fuse_mmap_enable, 0, - "If non-zero, and data_cache_mode is also non-zero, enable mmap(2) of " - "FUSE files"); - -int fuse_refresh_size = 0; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, refresh_size, CTLFLAG_RW, - &fuse_refresh_size, 0, - "If non-zero, and no dirty file extension data is buffered, fetch file " - "size before write operations"); - -int fuse_sync_resize = 1; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, sync_resize, CTLFLAG_RW, - &fuse_sync_resize, 0, - "If a cached write extended a file, inform FUSE filesystem of the changed" - "size immediately subsequent to the issued writes"); - -int fuse_fix_broken_io = 0; - -SYSCTL_INT(_vfs_fusefs, OID_AUTO, fix_broken_io, CTLFLAG_RW, - &fuse_fix_broken_io, 0, - "If non-zero, print a diagnostic warning if a userspace filesystem returns" - " EIO on reads of recently extended portions of files"); - static int sysctl_fuse_cache_mode(SYSCTL_HANDLER_ARGS) { @@ -174,9 +155,8 @@ fuse_vnode_init(struct vnode *vp, struct fuse_vnode_data *fvdat, uint64_t nodeid, enum vtype 
vtyp) { - int i; - fvdat->nid = nodeid; + LIST_INIT(&fvdat->handles); vattr_null(&fvdat->cached_attrs); if (nodeid == FUSE_ROOT_ID) { vp->v_vflag |= VV_ROOT; @@ -184,10 +164,7 @@ vp->v_type = vtyp; vp->v_data = fvdat; - for (i = 0; i < FUFH_MAXTYPE; i++) - fvdat->fufh[i].fh_type = FUFH_INVALID; - - atomic_add_acq_int(&fuse_node_count, 1); + counter_u64_add(fuse_node_count, 1); } void @@ -196,23 +173,21 @@ struct fuse_vnode_data *fvdat = vp->v_data; vp->v_data = NULL; + KASSERT(LIST_EMPTY(&fvdat->handles), + ("Destroying fuse vnode with open files!")); free(fvdat, M_FUSEVN); - atomic_subtract_acq_int(&fuse_node_count, 1); + counter_u64_add(fuse_node_count, -1); } -static int +int fuse_vnode_cmp(struct vnode *vp, void *nidp) { return (VTOI(vp) != *((uint64_t *)nidp)); } -static uint32_t inline -fuse_vnode_hash(uint64_t id) -{ - return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT)); -} - +SDT_PROBE_DEFINE3(fusefs, , node, stale_vnode, "struct vnode*", "enum vtype", + "uint64_t"); static int fuse_vnode_alloc(struct mount *mp, struct thread *td, @@ -220,10 +195,12 @@ enum vtype vtyp, struct vnode **vpp) { + struct fuse_data *data; struct fuse_vnode_data *fvdat; struct vnode *vp2; int err = 0; + data = fuse_get_mpdata(mp); if (vtyp == VNON) { return EINVAL; } @@ -234,12 +211,34 @@ return (err); if (*vpp) { - MPASS((*vpp)->v_type == vtyp && (*vpp)->v_data != NULL); - SDT_PROBE2(fuse, , node, trace, 1, "vnode taken from hash"); + if ((*vpp)->v_type != vtyp) { + /* + * STALE vnode! 
This probably indicates a buggy + * server, but it could also be the result of a race + * between FUSE_LOOKUP and another client's + * FUSE_UNLINK/FUSE_CREATE + */ + SDT_PROBE3(fusefs, , node, stale_vnode, *vpp, vtyp, + nodeid); + fuse_internal_vnode_disappear(*vpp); + lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL); + *vpp = NULL; + return (EAGAIN); + } + MPASS((*vpp)->v_data != NULL); + MPASS(VTOFUD(*vpp)->nid == nodeid); + SDT_PROBE2(fusefs, , node, trace, 1, "vnode taken from hash"); return (0); } fvdat = malloc(sizeof(*fvdat), M_FUSEVN, M_WAITOK | M_ZERO); - err = getnewvnode("fuse", mp, &fuse_vnops, vpp); + switch (vtyp) { + case VFIFO: + err = getnewvnode("fuse", mp, &fuse_fifoops, vpp); + break; + default: + err = getnewvnode("fuse", mp, &fuse_vnops, vpp); + break; + } if (err) { free(fvdat, M_FUSEVN); return (err); @@ -249,14 +248,23 @@ err = insmntque(*vpp, mp); ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc"); if (err) { + lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL); free(fvdat, M_FUSEVN); *vpp = NULL; return (err); } + /* Disallow async reads for fifos because UFS does. I don't know why */ + if (data->dataflags & FSESS_ASYNC_READ && vtyp != VFIFO) + VN_LOCK_ASHARE(*vpp); + err = vfs_hash_insert(*vpp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE, td, &vp2, fuse_vnode_cmp, &nodeid); - if (err) + if (err) { + lockmgr((*vpp)->v_vnlock, LK_RELEASE, NULL); + free(fvdat, M_FUSEVN); + *vpp = NULL; return (err); + } if (vp2 != NULL) { *vpp = vp2; return (0); @@ -277,6 +285,11 @@ enum vtype vtyp) { struct thread *td = (cnp != NULL ? cnp->cn_thread : curthread); + /* + * feo should only be NULL for the root directory, which (when libfuse + * is used) always has generation 0 + */ + uint64_t generation = feo ? 
feo->generation : 0; int err = 0; err = fuse_vnode_alloc(mp, td, nodeid, vtyp, vpp); @@ -284,22 +297,28 @@ return err; } if (dvp != NULL) { - MPASS((cnp->cn_flags & ISDOTDOT) == 0); - MPASS(!(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')); + MPASS(cnp && (cnp->cn_flags & ISDOTDOT) == 0); + MPASS(cnp && + !(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')); fuse_vnode_setparent(*vpp, dvp); } if (dvp != NULL && cnp != NULL && (cnp->cn_flags & MAKEENTRY) != 0 && feo != NULL && (feo->entry_valid != 0 || feo->entry_valid_nsec != 0)) { + struct timespec timeout; + ASSERT_VOP_LOCKED(*vpp, "fuse_vnode_get"); ASSERT_VOP_LOCKED(dvp, "fuse_vnode_get"); - cache_enter(dvp, *vpp, cnp); + + fuse_validity_2_timespec(feo, &timeout); + cache_enter_time(dvp, *vpp, cnp, &timeout, NULL); } + VTOFUD(*vpp)->generation = generation; /* * In userland, libfuse uses cached lookups for dot and dotdot entries, * thus it does not really bump the nlookup counter for forget. - * Follow the same semantic and avoid tu bump it in order to keep + * Follow the same semantic and avoid the bump in order to keep * nlookup counters consistent. */ if (cnp == NULL || ((cnp->cn_flags & ISDOTDOT) == 0 && @@ -309,44 +328,19 @@ return 0; } +/* + * Called for every fusefs vnode open to initialize the vnode (not + * fuse_filehandle) for use + */ void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td) { - /* - * Funcation is called for every vnode open. - * Merge fuse_open_flags it may be 0 - */ - /* - * Ideally speaking, direct io should be enabled on - * fd's but do not see of any way of providing that - * this implementation. - * - * Also cannot think of a reason why would two - * different fd's on same vnode would like - * have DIRECT_IO turned on and off. But linux - * based implementation works on an fd not an - * inode and provides such a feature. 
- * - * XXXIP: Handle fd based DIRECT_IO - */ - if (fuse_open_flags & FOPEN_DIRECT_IO) { - ASSERT_VOP_ELOCKED(vp, __func__); - VTOFUD(vp)->flag |= FN_DIRECTIO; - fuse_io_invalbuf(vp, td); - } else { - if ((fuse_open_flags & FOPEN_KEEP_CACHE) == 0) - fuse_io_invalbuf(vp, td); - VTOFUD(vp)->flag &= ~FN_DIRECTIO; - } - - if (vnode_vtype(vp) == VREG) { - /* XXXIP prevent getattr, by using cached node size */ + if (vnode_vtype(vp) == VREG) vnode_create_vobject(vp, 0, td); - } } int -fuse_vnode_savesize(struct vnode *vp, struct ucred *cred) +fuse_vnode_savesize(struct vnode *vp, struct ucred *cred, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct thread *td = curthread; @@ -375,10 +369,11 @@ fsai->valid = 0; /* Truncate to a new value. */ - fsai->size = fvdat->filesize; + MPASS((fvdat->flag & FN_SIZECHANGE) != 0); + fsai->size = fvdat->cached_attrs.va_size; fsai->valid |= FATTR_SIZE; - fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); + fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); if (fufh) { fsai->fh = fufh->fh_id; fsai->valid |= FATTR_FH; @@ -391,38 +386,115 @@ return err; } -void -fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred) -{ - - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct vattr va; - - if ((fvdat->flag & FN_SIZECHANGE) != 0 || - fuse_data_cache_mode == FUSE_CACHE_UC || - (fuse_refresh_size == 0 && fvdat->filesize != 0)) - return; - - VOP_GETATTR(vp, &va, cred); - SDT_PROBE2(fuse, , node, trace, 1, "refreshed file size"); -} - +/* + * Adjust the vnode's size to a new value, such as that provided by + * FUSE_GETATTR. 
+ */ int fuse_vnode_setsize(struct vnode *vp, off_t newsize) { struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct vattr *attrs; off_t oldsize; + size_t iosize; + struct buf *bp = NULL; int err = 0; ASSERT_VOP_ELOCKED(vp, "fuse_vnode_setsize"); - oldsize = fvdat->filesize; - fvdat->filesize = newsize; - fvdat->flag |= FN_SIZECHANGE; + iosize = fuse_iosize(vp); + oldsize = fvdat->cached_attrs.va_size; + fvdat->cached_attrs.va_size = newsize; + if ((attrs = VTOVA(vp)) != NULL) + attrs->va_size = newsize; if (newsize < oldsize) { + daddr_t lbn; + err = vtruncbuf(vp, newsize, fuse_iosize(vp)); + if (err) + goto out; + if (newsize % iosize == 0) + goto out; + /* + * Zero the contents of the last partial block. + * Sure seems like vtruncbuf should do this for us. + */ + + lbn = newsize / iosize; + bp = getblk(vp, lbn, iosize, PCATCH, 0, 0); + if (!bp) { + err = EINTR; + goto out; + } + if (!(bp->b_flags & B_CACHE)) + goto out; /* Nothing to do */ + MPASS(bp->b_flags & B_VMIO); + vfs_bio_clrbuf(bp); + bp->b_dirtyend = MIN(bp->b_dirtyend, newsize - lbn * iosize); } +out: + if (bp) + brelse(bp); vnode_pager_setsize(vp, newsize); return err; +} + +/* Get the current, possibly dirty, size of the file */ +int +fuse_vnode_size(struct vnode *vp, off_t *filesize, struct ucred *cred, + struct thread *td) +{ + struct fuse_vnode_data *fvdat = VTOFUD(vp); + int error = 0; + + if (!(fvdat->flag & FN_SIZECHANGE) && + (VTOVA(vp) == NULL || fvdat->cached_attrs.va_size == VNOVAL)) + error = fuse_internal_do_getattr(vp, NULL, cred, td); + + if (!error) + *filesize = fvdat->cached_attrs.va_size; + + return error; +} + +void +fuse_vnode_undirty_cached_timestamps(struct vnode *vp) +{ + struct fuse_vnode_data *fvdat = VTOFUD(vp); + + fvdat->flag &= ~(FN_MTIMECHANGE | FN_CTIMECHANGE); +} + +/* Update a fuse file's cached timestamps */ +void +fuse_vnode_update(struct vnode *vp, int flags) +{ + struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_data *data = 
fuse_get_mpdata(vnode_mount(vp)); + struct timespec ts; + + vfs_timestamp(&ts); + + if (data->time_gran > 1) + ts.tv_nsec = rounddown(ts.tv_nsec, data->time_gran); + + if (flags & FN_MTIMECHANGE) + fvdat->cached_attrs.va_mtime = ts; + if (flags & FN_CTIMECHANGE) + fvdat->cached_attrs.va_ctime = ts; + + fvdat->flag |= flags; +} + +void +fuse_node_init(void) +{ + fuse_node_count = counter_u64_alloc(M_WAITOK); +} + +void +fuse_node_destroy(void) +{ + counter_u64_free(fuse_node_count); } Index: sys/fs/fuse/fuse_param.h =================================================================== --- sys/fs/fuse/fuse_param.h +++ /dev/null @@ -1,82 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-3-Clause - * - * Copyright (c) 2007-2009 Google Inc. and Amit Singh - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Google Inc. nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _FUSE_PARAM_H_ -#define _FUSE_PARAM_H_ - -/* - * This is the prefix ("fuse" by default) of the name of a FUSE device node - * in devfs. The suffix is the device number. "/dev/fuse0" is the first FUSE - * device by default. If you change the prefix from the default to something - * else, the user-space FUSE library will need to know about it too. - */ -#define FUSE_DEVICE_BASENAME "fuse" - -/* - * This is the number of /dev/fuse nodes we will create. goes from - * 0 to (FUSE_NDEVICES - 1). - */ -#define FUSE_NDEVICES 16 - -/* - * This is the default block size of the virtual storage devices that are - * implicitly implemented by the FUSE kernel extension. This can be changed - * on a per-mount basis (there's one such virtual device for each mount). - */ -#define FUSE_DEFAULT_BLOCKSIZE 4096 - -/* - * This is default I/O size used while accessing the virtual storage devices. - * This can be changed on a per-mount basis. - */ -#define FUSE_DEFAULT_IOSIZE 4096 - -#ifdef KERNEL - -/* - * This is the soft upper limit on the number of "request tickets" FUSE's - * user-kernel IPC layer can have for a given mount. This can be modified - * through the fuse.* sysctl interface. 
- */ -#define FUSE_DEFAULT_MAX_FREE_TICKETS 1024 - -#define FUSE_DEFAULT_IOV_PERMANENT_BUFSIZE (1L << 19) -#define FUSE_DEFAULT_IOV_CREDIT 16 - -#endif - -#define FUSE_LINK_MAX UINT32_MAX - -#endif /* _FUSE_PARAM_H_ */ Index: sys/fs/fuse/fuse_vfsops.c =================================================================== --- sys/fs/fuse/fuse_vfsops.c +++ sys/fs/fuse/fuse_vfsops.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -81,7 +86,6 @@ #include #include "fuse.h" -#include "fuse_param.h" #include "fuse_node.h" #include "fuse_ipc.h" #include "fuse_internal.h" @@ -89,13 +93,13 @@ #include #include -SDT_PROVIDER_DECLARE(fuse); +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , vfsops, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , vfsops, trace, "int", "char*"); /* This will do for privilege types for now */ #ifndef PRIV_VFS_FUSE_ALLOWOTHER @@ -108,30 +112,28 @@ #define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER #endif +static vfs_fhtovp_t fuse_vfsop_fhtovp; static vfs_mount_t fuse_vfsop_mount; static vfs_unmount_t fuse_vfsop_unmount; static vfs_root_t fuse_vfsop_root; static vfs_statfs_t fuse_vfsop_statfs; +static vfs_vget_t fuse_vfsop_vget; struct vfsops fuse_vfsops = { + .vfs_fhtovp = fuse_vfsop_fhtovp, .vfs_mount = fuse_vfsop_mount, .vfs_unmount = fuse_vfsop_unmount, .vfs_root = fuse_vfsop_root, .vfs_statfs = fuse_vfsop_statfs, + .vfs_vget = fuse_vfsop_vget, }; -SYSCTL_INT(_vfs_fusefs, OID_AUTO, init_backgrounded, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, 1, "indicate async handshake"); static int fuse_enforce_dev_perms = 0; SYSCTL_INT(_vfs_fusefs, OID_AUTO, enforce_dev_perms, CTLFLAG_RW, &fuse_enforce_dev_perms, 0, "enforce fuse device permissions for secondary mounts"); -static unsigned sync_unmount = 1; -SYSCTL_UINT(_vfs_fusefs, OID_AUTO, sync_unmount, CTLFLAG_RW, - &sync_unmount, 0, "specify when to use synchronous unmount"); - MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer"); static int @@ -208,11 +210,90 @@ vfs_flagopt(opts, "__" #fnam, &__mntopts, fval); \ } while (0) -SDT_PROBE_DEFINE1(fuse, , vfsops, mntopts, "uint64_t"); -SDT_PROBE_DEFINE4(fuse, , vfsops, mount_err, "char*", "struct fuse_data*", +SDT_PROBE_DEFINE1(fusefs, , vfsops, mntopts, "uint64_t"); +SDT_PROBE_DEFINE4(fusefs, , vfsops, mount_err, "char*", "struct fuse_data*", "struct mount*", "int"); static int +fuse_vfs_remount(struct mount *mp, struct thread *td, uint64_t mntopts, + uint32_t max_read, int daemon_timeout) +{ + int err = 0; + struct fuse_data *data = fuse_get_mpdata(mp); + /* Don't allow these options to be changed */ + const 
static unsigned long long cant_update_opts = + MNT_USER; /* Mount owner must be the user running the daemon */ + + FUSE_LOCK(); + + if ((mp->mnt_flag ^ data->mnt_flag) & cant_update_opts) { + err = EOPNOTSUPP; + SDT_PROBE4(fusefs, , vfsops, mount_err, + "Can't change these mount options during remount", + data, mp, err); + goto out; + } + if (((data->dataflags ^ mntopts) & FSESS_MNTOPTS_MASK) || + (data->max_read != max_read) || + (data->daemon_timeout != daemon_timeout)) { + // TODO: allow changing options where it makes sense + err = EOPNOTSUPP; + SDT_PROBE4(fusefs, , vfsops, mount_err, + "Can't change fuse mount options during remount", + data, mp, err); + goto out; + } + + if (fdata_get_dead(data)) { + err = ENOTCONN; + SDT_PROBE4(fusefs, , vfsops, mount_err, + "device is dead during mount", data, mp, err); + goto out; + } + + /* Sanity + permission checks */ + if (!data->daemoncred) + panic("fuse daemon found, but identity unknown"); + if (mntopts & FSESS_DAEMON_CAN_SPY) + err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER); + if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid) + /* are we allowed to do the first mount? 
*/ + err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER); + +out: + FUSE_UNLOCK(); + return err; +} + +static int +fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags, + struct vnode **vpp) +{ + struct fuse_fid *ffhp = (struct fuse_fid *)fhp; + struct fuse_vnode_data *fvdat; + struct vnode *nvp; + int error; + + if (!(fuse_get_mpdata(mp)->dataflags & FSESS_EXPORT_SUPPORT)) + return EOPNOTSUPP; + + error = VFS_VGET(mp, ffhp->nid, LK_EXCLUSIVE, &nvp); + if (error) { + *vpp = NULLVP; + return (error); + } + fvdat = VTOFUD(nvp); + if (fvdat->generation != ffhp->gen ) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + vnode_create_vobject(*vpp, 0, curthread); + return (0); +} + +static int fuse_vfsop_mount(struct mount *mp) { int err; @@ -238,13 +319,6 @@ __mntopts = 0; td = curthread; - if (mp->mnt_flag & MNT_UPDATE) - return EOPNOTSUPP; - - MNT_ILOCK(mp); - mp->mnt_flag |= MNT_SYNCHRONOUS; - mp->mnt_data = NULL; - MNT_IUNLOCK(mp); /* Get the new options passed to mount */ opts = mp->mnt_optnew; @@ -255,19 +329,6 @@ if (!vfs_getopts(opts, "fspath", &err)) return err; - /* `from' contains the device name (eg. 
/dev/fuse0); REQUIRED */ - fspec = vfs_getopts(opts, "from", &err); - if (!fspec) - return err; - - /* `fd' contains the filedescriptor for this session; REQUIRED */ - if (vfs_scanopt(opts, "fd", "%d", &fd) != 1) - return EINVAL; - - err = fuse_getdevice(fspec, td, &fdev); - if (err != 0) - return err; - /* * With the help of underscored options the mount program * can inform us from the flags it sets by default @@ -275,12 +336,7 @@ FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY); FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN); FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS); - FUSE_FLAGOPT(no_attrcache, FSESS_NO_ATTRCACHE); - FUSE_FLAGOPT(no_readahed, FSESS_NO_READAHEAD); - FUSE_FLAGOPT(no_datacache, FSESS_NO_DATACACHE); - FUSE_FLAGOPT(no_namecache, FSESS_NO_NAMECACHE); - FUSE_FLAGOPT(no_mmap, FSESS_NO_MMAP); - FUSE_FLAGOPT(brokenio, FSESS_BROKENIO); + FUSE_FLAGOPT(intr, FSESS_INTR); (void)vfs_scanopt(opts, "max_read=", "%u", &max_read); if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) { @@ -293,11 +349,29 @@ } subtype = vfs_getopts(opts, "subtype=", &err); - SDT_PROBE1(fuse, , vfsops, mntopts, mntopts); + SDT_PROBE1(fusefs, , vfsops, mntopts, mntopts); + if (mp->mnt_flag & MNT_UPDATE) { + return fuse_vfs_remount(mp, td, mntopts, max_read, + daemon_timeout); + } + + /* `from' contains the device name (eg. 
/dev/fuse0); REQUIRED */ + fspec = vfs_getopts(opts, "from", &err); + if (!fspec) + return err; + + /* `fd' contains the filedescriptor for this session; REQUIRED */ + if (vfs_scanopt(opts, "fd", "%d", &fd) != 1) + return EINVAL; + + err = fuse_getdevice(fspec, td, &fdev); + if (err != 0) + return err; + err = fget(td, fd, &cap_read_rights, &fp); if (err != 0) { - SDT_PROBE2(fuse, , vfsops, trace, 1, + SDT_PROBE2(fusefs, , vfsops, trace, 1, "invalid or not opened device"); goto out; } @@ -307,16 +381,17 @@ td->td_fpop = fptmp; fdrop(fp, td); FUSE_LOCK(); - if (err != 0 || data == NULL || data->mp != NULL) { + + if (err != 0 || data == NULL) { err = ENXIO; - SDT_PROBE4(fuse, , vfsops, mount_err, + SDT_PROBE4(fusefs, , vfsops, mount_err, "invalid or not opened device", data, mp, err); FUSE_UNLOCK(); goto out; } if (fdata_get_dead(data)) { err = ENOTCONN; - SDT_PROBE4(fuse, , vfsops, mount_err, + SDT_PROBE4(fusefs, , vfsops, mount_err, "device is dead during mount", data, mp, err); FUSE_UNLOCK(); goto out; @@ -338,12 +413,17 @@ data->dataflags |= mntopts; data->max_read = max_read; data->daemon_timeout = daemon_timeout; + data->mnt_flag = mp->mnt_flag & MNT_UPDATEMASK; FUSE_UNLOCK(); vfs_getnewfsid(mp); MNT_ILOCK(mp); mp->mnt_data = data; - mp->mnt_flag |= MNT_LOCAL; + /* + * FUSE file systems can be either local or remote, but the kernel + * can't tell the difference. 
+ */ + mp->mnt_flag &= ~MNT_LOCAL; mp->mnt_kern_flag |= MNTK_USES_BCACHE; MNT_IUNLOCK(mp); /* We need this here as this slot is used by getnewvnode() */ @@ -354,6 +434,7 @@ } copystr(fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &len); bzero(mp->mnt_stat.f_mntfromname + len, MNAMELEN - len); + mp->mnt_iosize_max = MAXPHYS; /* Now handshaking with daemon */ fuse_internal_send_init(data, td); @@ -366,9 +447,10 @@ * Destroy device only if we acquired reference to * it */ - SDT_PROBE4(fuse, , vfsops, mount_err, + SDT_PROBE4(fusefs, , vfsops, mount_err, "mount failed, destroy device", data, mp, err); data->mp = NULL; + mp->mnt_data = NULL; fdata_trydestroy(data); } FUSE_UNLOCK(); @@ -412,11 +494,13 @@ if (fdata_get_dead(data)) { goto alreadydead; } - fdisp_init(&fdi, 0); - fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL); + if (fsess_isimpl(mp, FUSE_DESTROY)) { + fdisp_init(&fdi, 0); + fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL); - err = fdisp_wait_answ(&fdi); - fdisp_destroy(&fdi); + (void)fdisp_wait_answ(&fdi); + fdisp_destroy(&fdi); + } fdata_set_dead(data); @@ -429,7 +513,6 @@ MNT_ILOCK(mp); mp->mnt_data = NULL; - mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); dev_rel(fdev); @@ -437,7 +520,87 @@ return 0; } +SDT_PROBE_DEFINE1(fusefs, , vfsops, invalidate_without_export, + "struct mount*"); static int +fuse_vfsop_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) +{ + struct fuse_data *data = fuse_get_mpdata(mp); + uint64_t nodeid = ino; + struct thread *td = curthread; + struct fuse_dispatcher fdi; + struct fuse_entry_out *feo; + struct fuse_vnode_data *fvdat; + const char dot[] = "."; + off_t filesize; + enum vtype vtyp; + int error; + + if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) { + /* + * Unreachable unless you do something stupid, like export a + * nullfs mount of a fusefs file system. 
+ */ + SDT_PROBE1(fusefs, , vfsops, invalidate_without_export, mp); + return (EOPNOTSUPP); + } + + error = fuse_internal_get_cached_vnode(mp, ino, flags, vpp); + if (error || *vpp != NULL) + return error; + + /* Do a LOOKUP, using nodeid as the parent and "." as filename */ + fdisp_init(&fdi, sizeof(dot)); + fdisp_make(&fdi, FUSE_LOOKUP, mp, nodeid, td, td->td_ucred); + memcpy(fdi.indata, dot, sizeof(dot)); + error = fdisp_wait_answ(&fdi); + + if (error) + return error; + + feo = (struct fuse_entry_out *)fdi.answ; + if (feo->nodeid == 0) { + /* zero nodeid means ENOENT and cache it */ + error = ENOENT; + goto out; + } + + vtyp = IFTOVT(feo->attr.mode); + error = fuse_vnode_get(mp, feo, nodeid, NULL, vpp, NULL, vtyp); + if (error) + goto out; + filesize = feo->attr.size; + + /* + * In the case where we are looking up a FUSE node represented by an + * existing cached vnode, and the true size reported by FUSE_LOOKUP + * doesn't match the vnode's cached size, then any cached writes beyond + * the file's current size are lost. 
+ * + * We can get here: + * * following attribute cache expiration, or + * * due to a bug in the daemon. + */ + fvdat = VTOFUD(*vpp); + if (vnode_isreg(*vpp) && + filesize != fvdat->cached_attrs.va_size && + fvdat->flag & FN_SIZECHANGE) { + printf("%s: WB cache incoherent on %s!\n", __func__, + vnode_mount(*vpp)->mnt_stat.f_mntonname); + + fvdat->flag &= ~FN_SIZECHANGE; + } + + fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, + feo->attr_valid_nsec, NULL); + fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec, + &fvdat->entry_cache_timeout); +out: + fdisp_destroy(&fdi); + return error; +} + +static int fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp) { struct fuse_data *data = fuse_get_mpdata(mp); @@ -454,13 +617,13 @@ FUSE_LOCK(); MPASS(data->vroot == NULL || data->vroot == *vpp); if (data->vroot == NULL) { - SDT_PROBE2(fuse, , vfsops, trace, 1, + SDT_PROBE2(fusefs, , vfsops, trace, 1, "new root vnode"); data->vroot = *vpp; FUSE_UNLOCK(); vref(*vpp); } else if (data->vroot != *vpp) { - SDT_PROBE2(fuse, , vfsops, trace, 1, + SDT_PROBE2(fusefs, , vfsops, trace, 1, "root vnode race"); FUSE_UNLOCK(); VOP_UNLOCK(*vpp, 0); @@ -523,7 +686,7 @@ sbp->f_files = 0; sbp->f_ffree = 0; sbp->f_namemax = 0; - sbp->f_bsize = FUSE_DEFAULT_BLOCKSIZE; + sbp->f_bsize = S_BLKSIZE; return 0; } Index: sys/fs/fuse/fuse_vnops.c =================================================================== --- sys/fs/fuse/fuse_vnops.c +++ sys/fs/fuse/fuse_vnops.c @@ -33,6 +33,11 @@ * Copyright (C) 2005 Csaba Henk. * All rights reserved. * + * Copyright (c) 2019 The FreeBSD Foundation + * + * Portions of this software were developed by BFF Storage Systems, LLC under + * sponsorship from the FreeBSD Foundation.
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -103,24 +108,30 @@ #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" -#include "fuse_param.h" #include "fuse_io.h" #include -SDT_PROVIDER_DECLARE(fuse); +/* Maximum number of hardlinks to a single FUSE file */ +#define FUSE_LINK_MAX UINT32_MAX + +SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ -SDT_PROBE_DEFINE2(fuse, , vnops, trace, "int", "char*"); +SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*"); /* vnode ops */ static vop_access_t fuse_vnop_access; +static vop_advlock_t fuse_vnop_advlock; +static vop_bmap_t fuse_vnop_bmap; +static vop_close_t fuse_fifo_close; static vop_close_t fuse_vnop_close; static vop_create_t fuse_vnop_create; static vop_deleteextattr_t fuse_vnop_deleteextattr; +static vop_fdatasync_t fuse_vnop_fdatasync; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_getextattr_t fuse_vnop_getextattr; @@ -145,19 +156,44 @@ static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; -static vop_putpages_t fuse_vnop_putpages; static vop_print_t fuse_vnop_print; +static vop_vptofh_t fuse_vnop_vptofh; +struct vop_vector fuse_fifoops = { + .vop_default = &fifo_specops, + .vop_access = fuse_vnop_access, + .vop_close = fuse_fifo_close, + .vop_fsync = fuse_vnop_fsync, + .vop_getattr = fuse_vnop_getattr, + .vop_inactive = fuse_vnop_inactive, + .vop_pathconf = fuse_vnop_pathconf, + .vop_print = fuse_vnop_print, + .vop_read = VOP_PANIC, + .vop_reclaim = fuse_vnop_reclaim, + .vop_setattr = fuse_vnop_setattr, + .vop_write = VOP_PANIC, + .vop_vptofh = fuse_vnop_vptofh, +}; + struct vop_vector fuse_vnops = { + .vop_allocate = VOP_EINVAL, .vop_default = &default_vnodeops, .vop_access = 
fuse_vnop_access, + .vop_advlock = fuse_vnop_advlock, + .vop_bmap = fuse_vnop_bmap, .vop_close = fuse_vnop_close, .vop_create = fuse_vnop_create, .vop_deleteextattr = fuse_vnop_deleteextattr, .vop_fsync = fuse_vnop_fsync, + .vop_fdatasync = fuse_vnop_fdatasync, .vop_getattr = fuse_vnop_getattr, .vop_getextattr = fuse_vnop_getextattr, .vop_inactive = fuse_vnop_inactive, + /* + * TODO: implement vop_ioctl after upgrading to protocol 7.16. + * FUSE_IOCTL was added in 7.11, but 32-bit compat is broken until + * 7.16. + */ .vop_link = fuse_vnop_link, .vop_listextattr = fuse_vnop_listextattr, .vop_lookup = fuse_vnop_lookup, @@ -165,6 +201,12 @@ .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = fuse_vnop_pathconf, + /* + * TODO: implement vop_poll after upgrading to protocol 7.21. + * FUSE_POLL was added in protocol 7.11, but it's kind of broken until + * 7.21, which adds the ability for the client to choose which poll + * events it wants, and for a client to deregister a file handle + */ .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, @@ -178,41 +220,103 @@ .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, - .vop_putpages = fuse_vnop_putpages, .vop_print = fuse_vnop_print, + .vop_vptofh = fuse_vnop_vptofh, }; -static u_long fuse_lookup_cache_hits = 0; +uma_zone_t fuse_pbuf_zone; -SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_hits, CTLFLAG_RD, - &fuse_lookup_cache_hits, 0, "number of positive cache hits in lookup"); +#define fuse_vm_page_lock(m) vm_page_lock((m)); +#define fuse_vm_page_unlock(m) vm_page_unlock((m)); +#define fuse_vm_page_lock_queues() ((void)0) +#define fuse_vm_page_unlock_queues() ((void)0) -static u_long fuse_lookup_cache_misses = 0; +/* Check permission for extattr operations, much like extattr_check_cred */ +static int +fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred, + struct thread *td, accmode_t 
accmode) +{ + struct mount *mp = vnode_mount(vp); + struct fuse_data *data = fuse_get_mpdata(mp); -SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_misses, CTLFLAG_RD, - &fuse_lookup_cache_misses, 0, "number of cache misses in lookup"); + /* + * Kernel-invoked always succeeds. + */ + if (cred == NOCRED) + return (0); -int fuse_lookup_cache_enable = 1; + /* + * Do not allow privileged processes in jail to directly manipulate + * system attributes. + */ + switch (ns) { + case EXTATTR_NAMESPACE_SYSTEM: + if (data->dataflags & FSESS_DEFAULT_PERMISSIONS) { + return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); + } + /* FALLTHROUGH */ + case EXTATTR_NAMESPACE_USER: + return (fuse_internal_access(vp, accmode, td, cred)); + default: + return (EPERM); + } +} -SYSCTL_INT(_vfs_fusefs, OID_AUTO, lookup_cache_enable, CTLFLAG_RW, - &fuse_lookup_cache_enable, 0, "if non-zero, enable lookup cache"); +/* Get a filehandle for a directory */ +static int +fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp, + struct ucred *cred, pid_t pid) +{ + if (fuse_filehandle_get(vp, FREAD, fufhp, cred, pid) == 0) + return 0; + return fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid); +} -/* - * XXX: This feature is highly experimental and can bring to instabilities, - * needs revisiting before to be enabled by default. 
- */ -static int fuse_reclaim_revoked = 0; +/* Send FUSE_FLUSH for this vnode */ +static int +fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) +{ + struct fuse_flush_in *ffi; + struct fuse_filehandle *fufh; + struct fuse_dispatcher fdi; + struct thread *td = curthread; + struct mount *mp = vnode_mount(vp); + int err; -SYSCTL_INT(_vfs_fusefs, OID_AUTO, reclaim_revoked, CTLFLAG_RW, - &fuse_reclaim_revoked, 0, ""); + if (!fsess_isimpl(vnode_mount(vp), FUSE_FLUSH)) + return 0; -uma_zone_t fuse_pbuf_zone; + err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); + if (err) + return err; -#define fuse_vm_page_lock(m) vm_page_lock((m)); -#define fuse_vm_page_unlock(m) vm_page_unlock((m)); -#define fuse_vm_page_lock_queues() ((void)0) -#define fuse_vm_page_unlock_queues() ((void)0) + fdisp_init(&fdi, sizeof(*ffi)); + fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred); + ffi = fdi.indata; + ffi->fh = fufh->fh_id; + /* + * If the file has a POSIX lock then we're supposed to set lock_owner. + * If not, then lock_owner is undefined. So we may as well always set + * it. + */ + ffi->lock_owner = td->td_proc->p_pid; + err = fdisp_wait_answ(&fdi); + if (err == ENOSYS) { + fsess_set_notimpl(mp, FUSE_FLUSH); + err = 0; + } + fdisp_destroy(&fdi); + return err; +} + +/* Close wrapper for fifos. 
*/ +static int +fuse_fifo_close(struct vop_close_args *ap) +{ + return (fifo_specops.vop_close(ap)); +} + /* struct vnop_access_args { struct vnode *a_vp; @@ -232,7 +336,6 @@ int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; - struct fuse_access_param facp; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; @@ -255,15 +358,192 @@ if (vnode_islnk(vp)) { return 0; } - bzero(&facp, sizeof(facp)); - err = fuse_internal_access(vp, accmode, &facp, ap->a_td, ap->a_cred); + err = fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred); return err; } /* - struct vnop_close_args { + * struct vop_advlock_args { + * struct vop_generic_args a_gen; + * struct vnode *a_vp; + * void *a_id; + * int a_op; + * struct flock *a_fl; + * int a_flags; + * } + */ +static int +fuse_vnop_advlock(struct vop_advlock_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct flock *fl = ap->a_fl; + struct thread *td = curthread; + struct ucred *cred = td->td_ucred; + pid_t pid = td->td_proc->p_pid; + struct fuse_filehandle *fufh; + struct fuse_dispatcher fdi; + struct fuse_lk_in *fli; + struct fuse_lk_out *flo; + enum fuse_opcode op; + int dataflags, err; + int flags = ap->a_flags; + + dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; + + if (fuse_isdeadfs(vp)) { + return ENXIO; + } + + if (!(dataflags & FSESS_POSIX_LOCKS)) + return vop_stdadvlock(ap); + /* FUSE doesn't properly support flock until protocol 7.17 */ + if (flags & F_FLOCK) + return vop_stdadvlock(ap); + + err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid); + if (err) + return err; + + fdisp_init(&fdi, sizeof(*fli)); + + switch(ap->a_op) { + case F_GETLK: + op = FUSE_GETLK; + break; + case F_SETLK: + op = FUSE_SETLK; + break; + case F_SETLKW: + op = FUSE_SETLKW; + break; + default: + return EINVAL; + } + + fdisp_make_vp(&fdi, op, vp, td, cred); + fli = fdi.indata; + fli->fh = fufh->fh_id; + fli->owner = fl->l_pid; + fli->lk.start = fl->l_start; + if (fl->l_len != 0) + fli->lk.end = fl->l_start + 
fl->l_len - 1; + else + fli->lk.end = INT64_MAX; + fli->lk.type = fl->l_type; + fli->lk.pid = fl->l_pid; + + err = fdisp_wait_answ(&fdi); + fdisp_destroy(&fdi); + + if (err == 0 && op == FUSE_GETLK) { + flo = fdi.answ; + fl->l_type = flo->lk.type; + fl->l_pid = flo->lk.pid; + if (flo->lk.type != F_UNLCK) { + fl->l_start = flo->lk.start; + if (flo->lk.end == INT64_MAX) + fl->l_len = 0; + else + fl->l_len = flo->lk.end - flo->lk.start + 1; + fl->l_start = flo->lk.start; + } + } + + return err; +} + +/* { struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; +} */ +static int +fuse_vnop_bmap(struct vop_bmap_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct bufobj **bo = ap->a_bop; + struct thread *td = curthread; + struct mount *mp; + struct fuse_dispatcher fdi; + struct fuse_bmap_in *fbi; + struct fuse_bmap_out *fbo; + struct fuse_data *data; + uint64_t biosize; + off_t filesize; + daddr_t lbn = ap->a_bn; + daddr_t *pbn = ap->a_bnp; + int *runp = ap->a_runp; + int *runb = ap->a_runb; + int error = 0; + int maxrun; + + if (fuse_isdeadfs(vp)) { + return ENXIO; + } + + mp = vnode_mount(vp); + data = fuse_get_mpdata(mp); + biosize = fuse_iosize(vp); + maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1, + data->max_readahead_blocks); + + if (bo != NULL) + *bo = &vp->v_bufobj; + + /* + * The FUSE_BMAP operation does not include the runp and runb + * variables, so we must guess. Report nonzero contiguous runs so + * cluster_read will combine adjacent reads. It's worthwhile to reduce + * upcalls even if we don't know the true physical layout of the file. 
+ * + * FUSE file systems may opt out of read clustering in two ways: + * * mounting with -onoclusterr + * * Setting max_readahead <= maxbcachebuf during FUSE_INIT + */ + if (runb != NULL) + *runb = MIN(lbn, maxrun); + if (runp != NULL) { + error = fuse_vnode_size(vp, &filesize, td->td_ucred, td); + if (error == 0) + *runp = MIN(MAX(0, filesize / biosize - lbn - 1), + maxrun); + else + *runp = 0; + } + + if (fsess_isimpl(mp, FUSE_BMAP)) { + fdisp_init(&fdi, sizeof(*fbi)); + fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred); + fbi = fdi.indata; + fbi->block = lbn; + fbi->blocksize = biosize; + error = fdisp_wait_answ(&fdi); + if (error == ENOSYS) { + fdisp_destroy(&fdi); + fsess_set_notimpl(mp, FUSE_BMAP); + error = 0; + } else { + fbo = fdi.answ; + if (error == 0 && pbn != NULL) + *pbn = fbo->block; + fdisp_destroy(&fdi); + return error; + } + } + + /* If the daemon doesn't support BMAP, make up a sensible default */ + if (pbn != NULL) + *pbn = lbn * btodb(biosize); + return (error); +} + +/* + struct vop_close_args { + struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; @@ -275,39 +555,48 @@ struct vnode *vp = ap->a_vp; struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; - fufh_type_t fufh_type; + struct thread *td = ap->a_td; + pid_t pid = td->td_proc->p_pid; + int err = 0; - if (fuse_isdeadfs(vp)) { + if (fuse_isdeadfs(vp)) return 0; - } - if (vnode_isdir(vp)) { - if (fuse_filehandle_valid(vp, FUFH_RDONLY)) { - fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); - } + if (vnode_isdir(vp)) return 0; - } - if (fflag & IO_NDELAY) { + if (fflag & IO_NDELAY) return 0; - } - fufh_type = fuse_filehandle_xlate_from_fflags(fflag); - if (!fuse_filehandle_valid(vp, fufh_type)) { - int i; - - for (i = 0; i < FUFH_MAXTYPE; i++) - if (fuse_filehandle_valid(vp, i)) - break; - if (i == FUFH_MAXTYPE) - panic("FUSE: fufh type %d found to be invalid in close" - " (fflag=0x%x)\n", - fufh_type, fflag); - } + err = fuse_flush(vp, cred, pid, fflag); + 
/* TODO: close the file handle, if we're sure it's no longer used */ if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { - fuse_vnode_savesize(vp, cred); + fuse_vnode_savesize(vp, cred, td->td_proc->p_pid); } - return 0; + return err; } +static void +fdisp_make_mknod_for_fallback( + struct fuse_dispatcher *fdip, + struct componentname *cnp, + struct vnode *dvp, + uint64_t parentnid, + struct thread *td, + struct ucred *cred, + mode_t mode, + enum fuse_opcode *op) +{ + struct fuse_mknod_in *fmni; + + fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1); + *op = FUSE_MKNOD; + fdisp_make(fdip, *op, vnode_mount(dvp), parentnid, td, cred); + fmni = fdip->indata; + fmni->mode = mode; + fmni->rdev = 0; + memcpy((char *)fdip->indata + sizeof(*fmni), cnp->cn_nameptr, + cnp->cn_namelen); + ((char *)fdip->indata)[sizeof(*fmni) + cnp->cn_namelen] = '\0'; +} /* struct vnop_create_args { struct vnode *a_dvp; @@ -326,107 +615,169 @@ struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; - struct fuse_open_in *foi; + struct fuse_data *data; + struct fuse_create_in *fci; struct fuse_entry_out *feo; - struct fuse_dispatcher fdi; + struct fuse_open_out *foo; + struct fuse_dispatcher fdi, fdi2; struct fuse_dispatcher *fdip = &fdi; + struct fuse_dispatcher *fdip2 = NULL; int err; struct mount *mp = vnode_mount(dvp); + data = fuse_get_mpdata(mp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); - uint64_t x_fh_id; - uint32_t x_open_flags; + enum fuse_opcode op; + int flags; - if (fuse_isdeadfs(dvp)) { + if (fuse_isdeadfs(dvp)) return ENXIO; - } + + /* FUSE expects sockets to be created with FUSE_MKNOD */ + if (vap->va_type == VSOCK) + return fuse_internal_mknod(dvp, vpp, cnp, vap); + + /* + * VOP_CREATE doesn't tell us the open(2) flags, so we guess. Only a + * writable mode makes sense, and we might as well include readability + * too. + */ + flags = O_RDWR; + bzero(&fdi, sizeof(fdi)); - /* XXX: Will we ever want devices ? 
*/ - if ((vap->va_type != VREG)) { - printf("fuse_vnop_create: unsupported va_type %d\n", - vap->va_type); + if (vap->va_type != VREG) return (EINVAL); - } - fdisp_init(fdip, sizeof(*foi) + cnp->cn_namelen + 1); - if (!fsess_isimpl(mp, FUSE_CREATE)) { - SDT_PROBE2(fuse, , vnops, trace, 1, - "eh, daemon doesn't implement create?"); - return (EINVAL); - } - fdisp_make(fdip, FUSE_CREATE, vnode_mount(dvp), parentnid, td, cred); + if (!fsess_isimpl(mp, FUSE_CREATE) || vap->va_type == VSOCK) { + /* Fallback to FUSE_MKNOD/FUSE_OPEN */ + fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, + cred, mode, &op); + } else { + /* Use FUSE_CREATE */ + size_t insize; - foi = fdip->indata; - foi->mode = mode; - foi->flags = O_CREAT | O_RDWR; + op = FUSE_CREATE; + fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1); + fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred); + fci = fdip->indata; + fci->mode = mode; + fci->flags = O_CREAT | flags; + if (fuse_libabi_geq(data, 7, 12)) { + insize = sizeof(*fci); + fci->umask = td->td_proc->p_fd->fd_cmask; + } else { + insize = sizeof(struct fuse_open_in); + } - memcpy((char *)fdip->indata + sizeof(*foi), cnp->cn_nameptr, - cnp->cn_namelen); - ((char *)fdip->indata)[sizeof(*foi) + cnp->cn_namelen] = '\0'; + memcpy((char *)fdip->indata + insize, cnp->cn_nameptr, + cnp->cn_namelen); + ((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0'; + } err = fdisp_wait_answ(fdip); if (err) { - if (err == ENOSYS) + if (err == ENOSYS && op == FUSE_CREATE) { fsess_set_notimpl(mp, FUSE_CREATE); - goto out; + fdisp_destroy(fdip); + fdisp_make_mknod_for_fallback(fdip, cnp, dvp, + parentnid, td, cred, mode, &op); + err = fdisp_wait_answ(fdip); + } + if (err) + goto out; } feo = fdip->answ; - if ((err = fuse_internal_checkentry(feo, VREG))) { + if ((err = fuse_internal_checkentry(feo, vap->va_type))) { goto out; } - err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, VREG); + + if (op == FUSE_CREATE) { + foo = (struct 
fuse_open_out*)(feo + 1); + } else { + /* Issue a separate FUSE_OPEN */ + struct fuse_open_in *foi; + + fdip2 = &fdi2; + fdisp_init(fdip2, sizeof(*foi)); + fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td, + cred); + foi = fdip2->indata; + foi->flags = flags; + err = fdisp_wait_answ(fdip2); + if (err) + goto out; + foo = fdip2->answ; + } + err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; - uint64_t fh_id = ((struct fuse_open_out *)(feo + 1))->fh; + uint64_t fh_id = foo->fh; fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; - fri->flags = OFLAGS(mode); + fri->flags = flags; fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); - fuse_insert_message(fdip->tick); - return err; + fuse_insert_message(fdip->tick, false); + goto out; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); + fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, + feo->attr_valid_nsec, NULL); - fdip->answ = feo + 1; - - x_fh_id = ((struct fuse_open_out *)(feo + 1))->fh; - x_open_flags = ((struct fuse_open_out *)(feo + 1))->open_flags; - fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, x_fh_id); - fuse_vnode_open(*vpp, x_open_flags, td); + fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo); + fuse_vnode_open(*vpp, foo->open_flags, td); + /* + * Purge the parent's attribute cache because the daemon should've + * updated its mtime and ctime + */ + fuse_vnode_clear_attr_cache(dvp); cache_purge_negative(dvp); out: + if (fdip2) + fdisp_destroy(fdip2); fdisp_destroy(fdip); return err; } /* - * Our vnop_fsync roughly corresponds to the FUSE_FSYNC method. The Linux - * version of FUSE also has a FUSE_FLUSH method. - * - * On Linux, fsync() synchronizes a file's complete in-core state with that - * on disk. 
The call is not supposed to return until the system has completed - * that action or until an error is detected. - * - * Linux also has an fdatasync() call that is similar to fsync() but is not - * required to update the metadata such as access time and modification time. - */ + struct vnop_fdatasync_args { + struct vop_generic_args a_gen; + struct vnode * a_vp; + struct thread * a_td; + }; +*/ +static int +fuse_vnop_fdatasync(struct vop_fdatasync_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct thread *td = ap->a_td; + int waitfor = MNT_WAIT; + int err = 0; + + if (fuse_isdeadfs(vp)) { + return 0; + } + if ((err = vop_stdfdatasync_buf(ap))) + return err; + + return fuse_internal_fsync(vp, td, waitfor, true); +} + /* struct vnop_fsync_args { - struct vnodeop_desc *a_desc; + struct vop_generic_args a_gen; struct vnode * a_vp; - struct ucred * a_cred; int a_waitfor; struct thread * a_td; }; @@ -436,31 +787,16 @@ { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; + int waitfor = ap->a_waitfor; + int err = 0; - struct fuse_filehandle *fufh; - struct fuse_vnode_data *fvdat = VTOFUD(vp); - - int type, err = 0; - if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; - if (!fsess_isimpl(vnode_mount(vp), - (vnode_vtype(vp) == VDIR ? 
FUSE_FSYNCDIR : FUSE_FSYNC))) { - goto out; - } - for (type = 0; type < FUFH_MAXTYPE; type++) { - fufh = &(fvdat->fufh[type]); - if (FUFH_IS_VALID(fufh)) { - fuse_internal_fsync(vp, td, NULL, fufh); - } - } - -out: - return 0; + return fuse_internal_fsync(vp, td, waitfor, false); } /* @@ -478,12 +814,9 @@ struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_attr_out *fao; int err = 0; int dataflags; - struct fuse_dispatcher fdi; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; @@ -498,48 +831,14 @@ goto fake; } } - fdisp_init(&fdi, 0); - if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) { - if ((err == ENOTCONN) && vnode_isvroot(vp)) { - /* see comment in fuse_vfsop_statfs() */ - fdisp_destroy(&fdi); - goto fake; - } - if (err == ENOENT) { - fuse_internal_vnode_disappear(vp); - } - goto out; + err = fuse_internal_getattr(vp, vap, cred, td); + if (err == ENOTCONN && vnode_isvroot(vp)) { + /* see comment in fuse_vfsop_statfs() */ + goto fake; + } else { + return err; } - fao = (struct fuse_attr_out *)fdi.answ; - fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, - fao->attr_valid_nsec, vap); - if (vap->va_type != vnode_vtype(vp)) { - fuse_internal_vnode_disappear(vp); - err = ENOENT; - goto out; - } - if ((fvdat->flag & FN_SIZECHANGE) != 0) - vap->va_size = fvdat->filesize; - - if (vnode_isreg(vp) && (fvdat->flag & FN_SIZECHANGE) == 0) { - /* - * This is for those cases when the file size changed without us - * knowing, and we want to catch up. 
- */ - off_t new_filesize = ((struct fuse_attr_out *) - fdi.answ)->attr.size; - - if (fvdat->filesize != new_filesize) { - fuse_vnode_setsize(vp, new_filesize); - fvdat->flag &= ~FN_SIZECHANGE; - } - } - -out: - fdisp_destroy(&fdi); - return err; - fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); @@ -560,31 +859,27 @@ struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_filehandle *fufh = NULL; + struct fuse_filehandle *fufh, *fufh_tmp; - int type, need_flush = 1; + int need_flush = 1; - for (type = 0; type < FUFH_MAXTYPE; type++) { - fufh = &(fvdat->fufh[type]); - if (FUFH_IS_VALID(fufh)) { - if (need_flush && vp->v_type == VREG) { - if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { - fuse_vnode_savesize(vp, NULL); - } - if (fuse_data_cache_invalidate || - (fvdat->flag & FN_REVOKED) != 0) - fuse_io_invalbuf(vp, td); - else - fuse_io_flushbuf(vp, MNT_WAIT, td); - need_flush = 0; + LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { + if (need_flush && vp->v_type == VREG) { + if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { + fuse_vnode_savesize(vp, NULL, 0); } - fuse_filehandle_close(vp, type, td, NULL); + if ((fvdat->flag & FN_REVOKED) != 0) + fuse_io_invalbuf(vp, td); + else + fuse_io_flushbuf(vp, MNT_WAIT, td); + need_flush = 0; } + fuse_filehandle_close(vp, fufh, td, NULL); } - if ((fvdat->flag & FN_REVOKED) != 0 && fuse_reclaim_revoked) { + if ((fvdat->flag & FN_REVOKED) != 0) vrecycle(vp); - } + return 0; } @@ -636,11 +931,39 @@ feo = fdi.answ; err = fuse_internal_checkentry(feo, vnode_vtype(vp)); + if (!err) { + /* + * Purge the parent's attribute cache because the daemon + * should've updated its mtime and ctime + */ + fuse_vnode_clear_attr_cache(tdvp); + fuse_internal_cache_attrs(vp, &feo->attr, feo->attr_valid, + feo->attr_valid_nsec, NULL); + } out: fdisp_destroy(&fdi); return err; } +struct fuse_lookup_alloc_arg { + struct fuse_entry_out *feo; + struct componentname *cnp; + uint64_t nid; + enum vtype 
vtyp; +}; + +/* Callback for vn_get_ino */ +static int +fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) +{ + struct fuse_lookup_alloc_arg *flaa = arg; + + return fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp, + flaa->vtyp); +} + +SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup, + "int", "struct timespec*", "struct timespec*"); /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; @@ -669,268 +992,146 @@ struct vnode *vp = NULL; struct fuse_dispatcher fdi; - enum fuse_opcode op; + bool did_lookup = false; + struct fuse_entry_out *feo = NULL; + enum vtype vtyp; /* vnode type of target */ + off_t filesize; /* filesize of target */ uint64_t nid; - struct fuse_access_param facp; if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } - if (!vnode_isdir(dvp)) { + if (!vnode_isdir(dvp)) return ENOTDIR; - } - if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) { + + if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) return EROFS; - } - /* - * We do access check prior to doing anything else only in the case - * when we are at fs root (we'd like to say, "we are at the first - * component", but that's not exactly the same... nevermind). - * See further comments at further access checks. - */ - bzero(&facp, sizeof(facp)); - if (vnode_isvroot(dvp)) { /* early permission check hack */ - if ((err = fuse_internal_access(dvp, VEXEC, &facp, td, cred))) { - return err; - } - } + if ((err = fuse_internal_access(dvp, VEXEC, td, cred))) + return err; + if (flags & ISDOTDOT) { + KASSERT(VTOFUD(dvp)->flag & FN_PARENT_NID, + ("Looking up .. is TODO")); nid = VTOFUD(dvp)->parent_nid; - if (nid == 0) { + if (nid == 0) return ENOENT; - } - fdisp_init(&fdi, 0); - op = FUSE_GETATTR; - goto calldaemon; + /* .. 
is obviously a directory */ + vtyp = VDIR; + filesize = 0; } else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') { nid = VTOI(dvp); - fdisp_init(&fdi, 0); - op = FUSE_GETATTR; - goto calldaemon; - } else if (fuse_lookup_cache_enable) { - err = cache_lookup(dvp, vpp, cnp, NULL, NULL); - switch (err) { + /* . is obviously a directory */ + vtyp = VDIR; + filesize = 0; + } else { + struct timespec now, timeout; + err = cache_lookup(dvp, vpp, cnp, &timeout, NULL); + getnanouptime(&now); + SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now); + switch (err) { case -1: /* positive match */ - atomic_add_acq_long(&fuse_lookup_cache_hits, 1); + if (timespeccmp(&timeout, &now, >)) { + counter_u64_add(fuse_lookup_cache_hits, 1); + } else { + /* Cache timeout */ + counter_u64_add(fuse_lookup_cache_misses, 1); + bintime_clear( + &VTOFUD(*vpp)->entry_cache_timeout); + cache_purge(*vpp); + if (dvp != *vpp) + vput(*vpp); + else + vrele(*vpp); + *vpp = NULL; + break; + } return 0; case 0: /* no match in cache */ - atomic_add_acq_long(&fuse_lookup_cache_misses, 1); + counter_u64_add(fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ + getnanouptime(&now); + if (timespeccmp(&timeout, &now, <=)) { + /* Cache timeout */ + cache_purge_negative(dvp); + break; + } /* fall through */ default: return err; } - } - nid = VTOI(dvp); - fdisp_init(&fdi, cnp->cn_namelen + 1); - op = FUSE_LOOKUP; -calldaemon: - fdisp_make(&fdi, op, mp, nid, td, cred); + nid = VTOI(dvp); + fdisp_init(&fdi, cnp->cn_namelen + 1); + fdisp_make(&fdi, FUSE_LOOKUP, mp, nid, td, cred); - if (op == FUSE_LOOKUP) { memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; - } - lookup_err = fdisp_wait_answ(&fdi); + lookup_err = fdisp_wait_answ(&fdi); + did_lookup = true; - if ((op == FUSE_LOOKUP) && !lookup_err) { /* lookup call succeeded */ - nid = ((struct fuse_entry_out *)fdi.answ)->nodeid; - if (!nid) { - /* - * zero nodeid is the same as "not 
found", - * but it's also cacheable (which we keep - * keep on doing not as of writing this) - */ - lookup_err = ENOENT; - } else if (nid == FUSE_ROOT_ID) { - lookup_err = EINVAL; + if (!lookup_err) { + /* lookup call succeeded */ + feo = (struct fuse_entry_out *)fdi.answ; + nid = feo->nodeid; + if (nid == 0) { + /* zero nodeid means ENOENT and cache it */ + struct timespec timeout; + + fdi.answ_stat = ENOENT; + lookup_err = ENOENT; + if (cnp->cn_flags & MAKEENTRY) { + fuse_validity_2_timespec(feo, &timeout); + cache_enter_time(dvp, *vpp, cnp, + &timeout, NULL); + } + } else if (nid == FUSE_ROOT_ID) { + lookup_err = EINVAL; + } + vtyp = IFTOVT(feo->attr.mode); + filesize = feo->attr.size; } + if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) { + fdisp_destroy(&fdi); + return lookup_err; + } } - if (lookup_err && - (!fdi.answ_stat || lookup_err != ENOENT || op != FUSE_LOOKUP)) { - fdisp_destroy(&fdi); - return lookup_err; - } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { + /* Entry not found */ + if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { + err = fuse_internal_access(dvp, VWRITE, td, cred); + if (!err) { + /* + * Set the SAVENAME flag to hold onto the + * pathname for use later in VOP_CREATE or + * VOP_RENAME. + */ + cnp->cn_flags |= SAVENAME; - if ((nameiop == CREATE || nameiop == RENAME) && islastcn - /* && directory dvp has not been removed */ ) { - - if (vfs_isrdonly(mp)) { - err = EROFS; - goto out; + err = EJUSTRETURN; } -#if 0 /* THINK_ABOUT_THIS */ - if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { - goto out; - } -#endif - - /* - * Possibly record the position of a slot in the - * directory large enough for the new component name. - * This can be recorded in the vnode private data for - * dvp. Set the SAVENAME flag to hold onto the - * pathname for use later in VOP_CREATE or VOP_RENAME. 
- */ - cnp->cn_flags |= SAVENAME; - - err = EJUSTRETURN; - goto out; - } - /* Consider inserting name into cache. */ - - /* - * No we can't use negative caching, as the fs - * changes are out of our control. - * False positives' falseness turns out just as things - * go by, but false negatives' falseness doesn't. - * (and aiding the caching mechanism with extra control - * mechanisms comes quite close to beating the whole purpose - * caching...) - */ -#if 0 - if ((cnp->cn_flags & MAKEENTRY) != 0) { - SDT_PROBE2(fuse, , vnops, trace, 1, - "inserting NULL into cache"); - cache_enter(dvp, NULL, cnp); - } -#endif - err = ENOENT; - goto out; - - } else { - - /* !lookup_err */ - - struct fuse_entry_out *feo = NULL; - struct fuse_attr *fattr = NULL; - - if (op == FUSE_GETATTR) { - fattr = &((struct fuse_attr_out *)fdi.answ)->attr; } else { - feo = (struct fuse_entry_out *)fdi.answ; - fattr = &(feo->attr); + err = ENOENT; } - - /* - * If deleting, and at end of pathname, return parameters - * which can be used to remove file. If the wantparent flag - * isn't set, we return only the directory, otherwise we go on - * and lock the inode, being careful with ".". - */ - if (nameiop == DELETE && islastcn) { - /* - * Check for write access on directory. - */ - facp.xuid = fattr->uid; - facp.facc_flags |= FACCESS_STICKY; - err = fuse_internal_access(dvp, VWRITE, &facp, td, cred); - facp.facc_flags &= ~FACCESS_XQUERIES; - - if (err) { - goto out; - } - if (nid == VTOI(dvp)) { - vref(dvp); - *vpp = dvp; - } else { - err = fuse_vnode_get(dvp->v_mount, feo, nid, - dvp, &vp, cnp, IFTOVT(fattr->mode)); - if (err) - goto out; - *vpp = vp; - } - - /* - * Save the name for use in VOP_RMDIR and VOP_REMOVE - * later. - */ - cnp->cn_flags |= SAVENAME; - goto out; - - } - /* - * If rewriting (RENAME), return the inode and the - * information required to rewrite the present directory - * Must get inode of directory entry to verify it's a - * regular file, or empty directory. 
- */ - if (nameiop == RENAME && wantparent && islastcn) { - -#if 0 /* THINK_ABOUT_THIS */ - if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { - goto out; - } -#endif - - /* - * Check for "." - */ - if (nid == VTOI(dvp)) { - err = EISDIR; - goto out; - } - err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, - &vp, cnp, IFTOVT(fattr->mode)); - if (err) { - goto out; - } - *vpp = vp; - /* - * Save the name for use in VOP_RENAME later. - */ - cnp->cn_flags |= SAVENAME; - - goto out; - } + } else { + /* Entry was found */ if (flags & ISDOTDOT) { - struct mount *mp; - int ltype; + struct fuse_lookup_alloc_arg flaa; - /* - * Expanded copy of vn_vget_ino() so that - * fuse_vnode_get() can be used. - */ - mp = dvp->v_mount; - ltype = VOP_ISLOCKED(dvp); - err = vfs_busy(mp, MBF_NOWAIT); - if (err != 0) { - vfs_ref(mp); - VOP_UNLOCK(dvp, 0); - err = vfs_busy(mp, 0); - vn_lock(dvp, ltype | LK_RETRY); - vfs_rel(mp); - if (err) - goto out; - if ((dvp->v_iflag & VI_DOOMED) != 0) { - err = ENOENT; - vfs_unbusy(mp); - goto out; - } - } - VOP_UNLOCK(dvp, 0); - err = fuse_vnode_get(vnode_mount(dvp), feo, nid, NULL, - &vp, cnp, IFTOVT(fattr->mode)); - vfs_unbusy(mp); - vn_lock(dvp, ltype | LK_RETRY); - if ((dvp->v_iflag & VI_DOOMED) != 0) { - if (err == 0) - vput(vp); - err = ENOENT; - } - if (err) - goto out; + flaa.nid = nid; + flaa.feo = feo; + flaa.cnp = cnp; + flaa.vtyp = vtyp; + err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0, + &vp); *vpp = vp; } else if (nid == VTOI(dvp)) { vref(dvp); @@ -939,25 +1140,26 @@ struct fuse_vnode_data *fvdat; err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, - &vp, cnp, IFTOVT(fattr->mode)); - if (err) { + &vp, cnp, vtyp); + if (err) goto out; - } - fuse_vnode_setparent(vp, dvp); + *vpp = vp; /* * In the case where we are looking up a FUSE node * represented by an existing cached vnode, and the * true size reported by FUSE_LOOKUP doesn't match - * the vnode's cached size, fix the vnode cache to - * match the real object 
size. + * the vnode's cached size, then any cached writes + * beyond the file's current size are lost. * - * This can occur via FUSE distributed filesystems, - * irregular files, etc. + * We can get here: + * * following attribute cache expiration, or + * * due a bug in the daemon, or */ fvdat = VTOFUD(vp); if (vnode_isreg(vp) && - fattr->size != fvdat->filesize) { + filesize != fvdat->cached_attrs.va_size && + fvdat->flag & FN_SIZECHANGE) { /* * The FN_SIZECHANGE flag reflects a dirty * append. If userspace lets us know our cache @@ -967,131 +1169,64 @@ * * XXX: Maybe disable WB caching on this mount. */ - if (fvdat->flag & FN_SIZECHANGE) - printf("%s: WB cache incoherent on " - "%s!\n", __func__, - vnode_mount(vp)->mnt_stat.f_mntonname); + printf("%s: WB cache incoherent on %s!\n", + __func__, + vnode_mount(vp)->mnt_stat.f_mntonname); - (void)fuse_vnode_setsize(vp, fattr->size); fvdat->flag &= ~FN_SIZECHANGE; } - *vpp = vp; - } - if (op == FUSE_GETATTR) { - struct fuse_attr_out *fao = - (struct fuse_attr_out*)fdi.answ; - fuse_internal_cache_attrs(*vpp, - &fao->attr, fao->attr_valid, - fao->attr_valid_nsec, NULL); - } else { - struct fuse_entry_out *feo = - (struct fuse_entry_out*)fdi.answ; - fuse_internal_cache_attrs(*vpp, - &feo->attr, feo->attr_valid, - feo->attr_valid_nsec, NULL); - } + MPASS(feo != NULL); + fuse_internal_cache_attrs(*vpp, &feo->attr, + feo->attr_valid, feo->attr_valid_nsec, NULL); + fuse_validity_2_bintime(feo->entry_valid, + feo->entry_valid_nsec, + &fvdat->entry_cache_timeout); - /* Insert name into cache if appropriate. */ + if ((nameiop == DELETE || nameiop == RENAME) && + islastcn) + { + struct vattr dvattr; - /* - * Nooo, caching is evil. With caching, we can't avoid stale - * information taking over the playground (cached info is not - * just positive/negative, it does have qualitative aspects, - * too). 
And a (VOP/FUSE)_GETATTR is always thrown anyway, when - * walking down along cached path components, and that's not - * any cheaper than FUSE_LOOKUP. This might change with - * implementing kernel side attr caching, but... In Linux, - * lookup results are not cached, and the daemon is bombarded - * with FUSE_LOOKUPS on and on. This shows that by design, the - * daemon is expected to handle frequent lookup queries - * efficiently, do its caching in userspace, and so on. - * - * So just leave the name cache alone. - */ - - /* - * Well, now I know, Linux caches lookups, but with a - * timeout... So it's the same thing as attribute caching: - * we can deal with it when implement timeouts. - */ -#if 0 - if (cnp->cn_flags & MAKEENTRY) { - cache_enter(dvp, *vpp, cnp); - } -#endif - } -out: - if (!lookup_err) { - - /* No lookup error; need to clean up. */ - - if (err) { /* Found inode; exit with no vnode. */ - if (op == FUSE_LOOKUP) { - fuse_internal_forget_send(vnode_mount(dvp), td, cred, - nid, 1); - } - fdisp_destroy(&fdi); - return err; - } else { -#ifndef NO_EARLY_PERM_CHECK_HACK - if (!islastcn) { - /* - * We have the attributes of the next item - * *now*, and it's a fact, and we do not - * have to do extra work for it (ie, beg the - * daemon), and it neither depends on such - * accidental things like attr caching. So - * the big idea: check credentials *now*, - * not at the beginning of the next call to - * lookup. - * - * The first item of the lookup chain (fs root) - * won't be checked then here, of course, as - * its never "the next". But go and see that - * the root is taken care about at the very - * beginning of this function. - * - * Now, given we want to do the access check - * this way, one might ask: so then why not - * do the access check just after fetching - * the inode and its attributes from the - * daemon? Why bother with producing the - * corresponding vnode at all if something - * is not OK? 
We know what's the deal as - * soon as we get those attrs... There is - * one bit of info though not given us by - * the daemon: whether his response is - * authoritative or not... His response should - * be ignored if something is mounted over - * the dir in question. But that can be - * known only by having the vnode... + err = fuse_internal_access(dvp, VWRITE, td, + cred); + if (err != 0) + goto out; + /* + * if the parent's sticky bit is set, check + * whether we're allowed to remove the file. + * Need to figure out the vnode locking to make + * this work. */ - int tmpvtype = vnode_vtype(*vpp); - - bzero(&facp, sizeof(facp)); - /*the early perm check hack */ - facp.facc_flags |= FACCESS_VA_VALID; - - if ((tmpvtype != VDIR) && (tmpvtype != VLNK)) { - err = ENOTDIR; + fuse_internal_getattr(dvp, &dvattr, cred, td); + if ((dvattr.va_mode & S_ISTXT) && + fuse_internal_access(dvp, VADMIN, td, + cred) && + fuse_internal_access(*vpp, VADMIN, td, + cred)) { + err = EPERM; + goto out; } - if (!err && !vnode_mountedhere(*vpp)) { - err = fuse_internal_access(*vpp, VEXEC, &facp, td, cred); - } - if (err) { - if (tmpvtype == VLNK) - SDT_PROBE2(fuse, , vnops, trace, - 1, "weird, permission " - "error with a symlink?"); - vput(*vpp); - *vpp = NULL; - } } -#endif + + if (islastcn && ( + (nameiop == DELETE) || + (nameiop == RENAME && wantparent))) { + cnp->cn_flags |= SAVENAME; + } + } } - fdisp_destroy(&fdi); +out: + if (err) { + if (vp != NULL && dvp != vp) + vput(vp); + else if (vp != NULL) + vrele(vp); + *vpp = NULL; + } + if (did_lookup) + fdisp_destroy(&fdi); return err; } @@ -1118,6 +1253,7 @@ return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); + fmdi.umask = curthread->td_proc->p_fd->fd_cmask; return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); @@ -1135,12 +1271,19 @@ fuse_vnop_mknod(struct vop_mknod_args *ap) { - return (EINVAL); -} + struct vnode *dvp = ap->a_dvp; + struct vnode **vpp = ap->a_vpp; + struct componentname 
*cnp = ap->a_cnp; + struct vattr *vap = ap->a_vap; + if (fuse_isdeadfs(dvp)) + return ENXIO; + return fuse_internal_mknod(dvp, vpp, cnp, vap); +} + /* - struct vnop_open_args { + struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; @@ -1152,50 +1295,27 @@ fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; - int mode = ap->a_mode; + int a_mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; - - fufh_type_t fufh_type; + pid_t pid = td->td_proc->p_pid; struct fuse_vnode_data *fvdat; - int error, isdir = 0; - int32_t fuse_open_flags; - - if (fuse_isdeadfs(vp)) { + if (fuse_isdeadfs(vp)) return ENXIO; - } - if ((mode & (FREAD | FWRITE)) == 0) + if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO) + return (EOPNOTSUPP); + if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0) return EINVAL; fvdat = VTOFUD(vp); - if (vnode_isdir(vp)) { - isdir = 1; - } - fuse_open_flags = 0; - if (isdir) { - fufh_type = FUFH_RDONLY; - } else { - fufh_type = fuse_filehandle_xlate_from_fflags(mode); - /* - * For WRONLY opens, force DIRECT_IO. This is necessary - * since writing a partial block through the buffer cache - * will result in a read of the block and that read won't - * be allowed by the WRONLY open. 
- */ - if (fufh_type == FUFH_WRONLY || - (fvdat->flag & FN_DIRECTIO) != 0) - fuse_open_flags = FOPEN_DIRECT_IO; - } - - if (fuse_filehandle_validrw(vp, fufh_type) != FUFH_INVALID) { - fuse_vnode_open(vp, fuse_open_flags, td); + if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) { + fuse_vnode_open(vp, 0, td); return 0; } - error = fuse_filehandle_open(vp, fufh_type, NULL, td, cred); - return error; + return fuse_filehandle_open(vp, a_mode, NULL, td, cred); } static int @@ -1238,6 +1358,7 @@ struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; + pid_t pid = curthread->td_proc->p_pid; if (fuse_isdeadfs(vp)) { return ENXIO; @@ -1247,7 +1368,7 @@ ioflag |= IO_DIRECT; } - return fuse_io_dispatch(vp, uio, ioflag, cred); + return fuse_io_dispatch(vp, uio, ioflag, cred, pid); } /* @@ -1256,7 +1377,7 @@ struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; - int *ncookies; + int *a_ncookies; u_long **a_cookies; }; */ @@ -1266,13 +1387,18 @@ struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; - struct fuse_filehandle *fufh = NULL; struct fuse_iov cookediov; - int err = 0; - int freefufh = 0; + u_long *cookies; + off_t startoff; + ssize_t tresid; + int ncookies; + bool closefufh = false; + pid_t pid = curthread->td_proc->p_pid; + if (ap->a_eofflag) + *ap->a_eofflag = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } @@ -1281,26 +1407,61 @@ return EINVAL; } - if (!fuse_filehandle_valid(vp, FUFH_RDONLY)) { - SDT_PROBE2(fuse, , vnops, trace, 1, - "calling readdir() before open()"); - err = fuse_filehandle_open(vp, FUFH_RDONLY, &fufh, NULL, cred); - freefufh = 1; - } else { - err = fuse_filehandle_get(vp, FUFH_RDONLY, &fufh); + tresid = uio->uio_resid; + startoff = uio->uio_offset; + err = fuse_filehandle_get_dir(vp, &fufh, cred, pid); + if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { + /* + * nfsd will do VOP_READDIR without first doing VOP_OPEN. 
We + * must implicitly open the directory here + */ + err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred); + if (err == 0) { + /* + * When a directory is opened, it must be read from + * the beginning. Hopefully, the "startoff" still + * exists as an offset cookie for the directory. + * If not, it will read the entire directory without + * returning any entries and just return eof. + */ + uio->uio_offset = 0; + } + closefufh = true; } - if (err) { + if (err) return (err); + if (ap->a_ncookies != NULL) { + ncookies = uio->uio_resid / + (offsetof(struct dirent, d_name) + 4) + 1; + cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); + *ap->a_ncookies = ncookies; + *ap->a_cookies = cookies; + } else { + ncookies = 0; + cookies = NULL; } #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1) fiov_init(&cookediov, DIRCOOKEDSIZE); - err = fuse_internal_readdir(vp, uio, fufh, &cookediov); + err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov, + &ncookies, cookies); fiov_teardown(&cookediov); - if (freefufh) { - fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); + if (closefufh) + fuse_filehandle_close(vp, fufh, curthread, cred); + + if (ap->a_ncookies != NULL) { + if (err == 0) { + *ap->a_ncookies -= ncookies; + } else { + free(*ap->a_cookies, M_TEMP); + *ap->a_ncookies = 0; + *ap->a_cookies = NULL; + } } + if (err == 0 && tresid == uio->uio_resid) + *ap->a_eofflag = 1; + return err; } @@ -1357,22 +1518,16 @@ { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; - struct fuse_vnode_data *fvdat = VTOFUD(vp); - struct fuse_filehandle *fufh = NULL; + struct fuse_filehandle *fufh, *fufh_tmp; - int type; - if (!fvdat) { panic("FUSE: no vnode data during recycling"); } - for (type = 0; type < FUFH_MAXTYPE; type++) { - fufh = &(fvdat->fufh[type]); - if (FUFH_IS_VALID(fufh)) { - printf("FUSE: vnode being reclaimed but fufh (type=%d) is valid", - type); - fuse_filehandle_close(vp, type, td, NULL); - } + 
LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { + printf("FUSE: vnode being reclaimed with open fufh " + "(type=%#x)", fufh->fufh_type); + fuse_filehandle_close(vp, fufh, td, NULL); } if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) { @@ -1410,12 +1565,9 @@ if (vnode_isdir(vp)) { return EPERM; } - cache_purge(vp); err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); - if (err == 0) - fuse_internal_vnode_disappear(vp); return err; } @@ -1439,7 +1591,8 @@ struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct fuse_data *data; - + bool newparent = fdvp != tdvp; + bool isdir = fvp->v_type == VDIR; int err = 0; if (fuse_isdeadfs(fdvp)) { @@ -1447,7 +1600,7 @@ } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { - SDT_PROBE2(fuse, , vnops, trace, 1, "cross-device rename"); + SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename"); err = EXDEV; goto out; } @@ -1458,7 +1611,17 @@ * under the source directory in the file system tree. * Linux performs this check at VFS level. */ + /* + * If source is a directory, and it will get a new parent, user must + * have write permission to it, so ".." can be modified. 
+ */ data = fuse_get_mpdata(vnode_mount(tdvp)); + if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) { + err = fuse_internal_access(fvp, VWRITE, + tcnp->cn_thread, tcnp->cn_cred); + if (err) + goto out; + } sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); if (err == 0) { @@ -1516,8 +1679,6 @@ } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); - if (err == 0) - fuse_internal_vnode_disappear(vp); return err; } @@ -1536,129 +1697,137 @@ struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; + struct mount *mp; + struct fuse_data *data; + struct vattr old_va; + int dataflags; + int err = 0, err2; + accmode_t accmode = 0; + bool checkperm; + bool drop_suid = false; + gid_t cr_gid; - struct fuse_dispatcher fdi; - struct fuse_setattr_in *fsai; - struct fuse_access_param facp; + mp = vnode_mount(vp); + data = fuse_get_mpdata(mp); + dataflags = data->dataflags; + checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS; + if (cred->cr_ngroups > 0) + cr_gid = cred->cr_groups[0]; + else + cr_gid = 0; - int err = 0; - enum vtype vtyp; - int sizechanged = 0; - uint64_t newsize = 0; - if (fuse_isdeadfs(vp)) { return ENXIO; } - fdisp_init(&fdi, sizeof(*fsai)); - fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); - fsai = fdi.indata; - fsai->valid = 0; - bzero(&facp, sizeof(facp)); - - facp.xuid = vap->va_uid; - facp.xgid = vap->va_gid; - if (vap->va_uid != (uid_t)VNOVAL) { - facp.facc_flags |= FACCESS_CHOWN; - fsai->uid = vap->va_uid; - fsai->valid |= FATTR_UID; + if (checkperm) { + /* Only root may change a file's owner */ + err = priv_check_cred(cred, PRIV_VFS_CHOWN); + if (err) { + /* As a special case, allow the null chown */ + err2 = fuse_internal_getattr(vp, &old_va, cred, + td); + if (err2) + return (err2); + if (vap->va_uid != old_va.va_uid) + return err; + else + accmode |= VADMIN; + drop_suid = true; + } else + accmode |= VADMIN; + } else + accmode |= VADMIN; } if (vap->va_gid 
!= (gid_t)VNOVAL) { - facp.facc_flags |= FACCESS_CHOWN; - fsai->gid = vap->va_gid; - fsai->valid |= FATTR_GID; + if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN)) + drop_suid = true; + if (checkperm && !groupmember(vap->va_gid, cred)) + { + /* + * Non-root users may only chgrp to one of their own + * groups + */ + err = priv_check_cred(cred, PRIV_VFS_CHOWN); + if (err) { + /* As a special case, allow the null chgrp */ + err2 = fuse_internal_getattr(vp, &old_va, cred, + td); + if (err2) + return (err2); + if (vap->va_gid != old_va.va_gid) + return err; + accmode |= VADMIN; + } else + accmode |= VADMIN; + } else + accmode |= VADMIN; } if (vap->va_size != VNOVAL) { - - struct fuse_filehandle *fufh = NULL; - - /*Truncate to a new value. */ - fsai->size = vap->va_size; - sizechanged = 1; - newsize = vap->va_size; - fsai->valid |= FATTR_SIZE; - - fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); - if (fufh) { - fsai->fh = fufh->fh_id; - fsai->valid |= FATTR_FH; + switch (vp->v_type) { + case VDIR: + return (EISDIR); + case VLNK: + case VREG: + if (vfs_isrdonly(mp)) + return (EROFS); + break; + default: + /* + * According to POSIX, the result is unspecified + * for file types other than regular files, + * directories and shared memory objects. We + * don't support shared memory objects in the file + * system, and have dubious support for truncating + * symlinks. Just ignore the request in other cases. + */ + return (0); } + /* Don't set accmode. 
Permission to trunc is checked upstack */ } - if (vap->va_atime.tv_sec != VNOVAL) { - fsai->atime = vap->va_atime.tv_sec; - fsai->atimensec = vap->va_atime.tv_nsec; - fsai->valid |= FATTR_ATIME; + if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { + if (vap->va_vaflags & VA_UTIMES_NULL) + accmode |= VWRITE; + else + accmode |= VADMIN; } - if (vap->va_mtime.tv_sec != VNOVAL) { - fsai->mtime = vap->va_mtime.tv_sec; - fsai->mtimensec = vap->va_mtime.tv_nsec; - fsai->valid |= FATTR_MTIME; + if (drop_suid) { + if (vap->va_mode != (mode_t)VNOVAL) + vap->va_mode &= ~(S_ISUID | S_ISGID); + else { + err = fuse_internal_getattr(vp, &old_va, cred, td); + if (err) + return (err); + vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID); + } } if (vap->va_mode != (mode_t)VNOVAL) { - fsai->mode = vap->va_mode & ALLPERMS; - fsai->valid |= FATTR_MODE; + /* Only root may set the sticky bit on non-directories */ + if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT) + && priv_check_cred(cred, PRIV_VFS_STICKYFILE)) + return EFTYPE; + if (checkperm && (vap->va_mode & S_ISGID)) { + err = fuse_internal_getattr(vp, &old_va, cred, td); + if (err) + return (err); + if (!groupmember(old_va.va_gid, cred)) { + err = priv_check_cred(cred, PRIV_VFS_SETGID); + if (err) + return (err); + } + } + accmode |= VADMIN; } - if (!fsai->valid) { - goto out; - } - vtyp = vnode_vtype(vp); - if (fsai->valid & FATTR_SIZE && vtyp == VDIR) { - err = EISDIR; - goto out; - } - if (vfs_isrdonly(vnode_mount(vp)) && (fsai->valid & ~FATTR_SIZE || vtyp == VREG)) { - err = EROFS; - goto out; - } - if (fsai->valid & ~FATTR_SIZE) { - /*err = fuse_internal_access(vp, VADMIN, context, &facp); */ - /*XXX */ - err = 0; - } - facp.facc_flags &= ~FACCESS_XQUERIES; + if (vfs_isrdonly(mp)) + return EROFS; - if (err && !(fsai->valid & ~(FATTR_ATIME | FATTR_MTIME)) && - vap->va_vaflags & VA_UTIMES_NULL) { - err = fuse_internal_access(vp, VWRITE, &facp, td, cred); - } + err = fuse_internal_access(vp, 
accmode, td, cred); if (err) - goto out; - if ((err = fdisp_wait_answ(&fdi))) - goto out; - vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode); - - if (vnode_vtype(vp) != vtyp) { - if (vnode_vtype(vp) == VNON && vtyp != VNON) { - SDT_PROBE2(fuse, , vnops, trace, 1, "FUSE: Dang! " - "vnode_vtype is VNON and vtype isn't."); - } else { - /* - * STALE vnode, ditch - * - * The vnode has changed its type "behind our back". - * There's nothing really we can do, so let us just - * force an internal revocation and tell the caller to - * try again, if interested. - */ - fuse_internal_vnode_disappear(vp); - err = EAGAIN; - } - } - if (err == 0) { - struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ; - fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, - fao->attr_valid_nsec, NULL); - } - -out: - fdisp_destroy(&fdi); - if (!err && sizechanged) { - fuse_vnode_setsize(vp, newsize); - VTOFUD(vp)->flag &= ~FN_SIZECHANGE; - } - return err; + return err; + else + return fuse_internal_setattr(vp, vap, td, cred); } /* @@ -1677,22 +1846,15 @@ bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); - return ENXIO; + return 0; } - if (bp->b_iocmd == BIO_WRITE) - fuse_vnode_refreshsize(vp, NOCRED); - (void)fuse_io_strategy(vp, bp); - /* - * This is a dangerous function. If returns error, that might mean a - * panic. We prefer pretty much anything over being forced to panic - * by a malicious daemon (a demon?). So we just return 0 anyway. You - * should never mind this: this function has its own error - * propagation mechanism via the argument buffer, so - * not-that-melodramatic residents of the call chain still will be - * able to know what to do. + * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags. 
+ * fuse_io_strategy sets bp's error fields */ + (void)fuse_io_strategy(vp, bp); + return 0; } @@ -1758,237 +1920,70 @@ struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; + pid_t pid = curthread->td_proc->p_pid; if (fuse_isdeadfs(vp)) { return ENXIO; } - fuse_vnode_refreshsize(vp, cred); if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } - return fuse_io_dispatch(vp, uio, ioflag, cred); + return fuse_io_dispatch(vp, uio, ioflag, cred, pid); } -SDT_PROBE_DEFINE1(fuse, , vnops, vnop_getpages_error, "int"); -/* - struct vnop_getpages_args { - struct vnode *a_vp; - vm_page_t *a_m; - int a_count; - int a_reqpage; - }; -*/ -static int -fuse_vnop_getpages(struct vop_getpages_args *ap) +static daddr_t +fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { - int i, error, nextoff, size, toff, count, npages; - struct uio uio; - struct iovec iov; - vm_offset_t kva; - struct buf *bp; - struct vnode *vp; - struct thread *td; - struct ucred *cred; - vm_page_t *pages; + const int biosize = fuse_iosize(vp); - vp = ap->a_vp; - KASSERT(vp->v_object, ("objectless vp passed to getpages")); - td = curthread; /* XXX */ - cred = curthread->td_ucred; /* XXX */ - pages = ap->a_m; - npages = ap->a_count; + return (off / biosize); +} - if (!fsess_opt_mmap(vnode_mount(vp))) { - SDT_PROBE2(fuse, , vnops, trace, 1, - "called on non-cacheable vnode??\n"); - return (VM_PAGER_ERROR); - } +static int +fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn) +{ + off_t filesize; + int blksz, err; + const int biosize = fuse_iosize(vp); - /* - * If the last page is partially valid, just return it and allow - * the pager to zero-out the blanks. Partially valid pages can - * only occur at the file EOF. - * - * XXXGL: is that true for FUSE, which is a local filesystem, - * but still somewhat disconnected from the kernel? 
- */ - VM_OBJECT_WLOCK(vp->v_object); - if (pages[npages - 1]->valid != 0 && --npages == 0) - goto out; - VM_OBJECT_WUNLOCK(vp->v_object); + err = fuse_vnode_size(vp, &filesize, NULL, NULL); + KASSERT(err == 0, ("vfs_bio_getpages can't handle errors here")); + if (err) + return biosize; - /* - * We use only the kva address for the buffer, but this is extremely - * convenient and fast. - */ - bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK); - - kva = (vm_offset_t)bp->b_data; - pmap_qenter(kva, pages, npages); - VM_CNT_INC(v_vnodein); - VM_CNT_ADD(v_vnodepgsin, npages); - - count = npages << PAGE_SHIFT; - iov.iov_base = (caddr_t)kva; - iov.iov_len = count; - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); - uio.uio_resid = count; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = UIO_READ; - uio.uio_td = td; - - error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); - pmap_qremove(kva, npages); - - uma_zfree(fuse_pbuf_zone, bp); - - if (error && (uio.uio_resid == count)) { - SDT_PROBE1(fuse, , vnops, vnop_getpages_error, error); - return VM_PAGER_ERROR; + if ((off_t)lbn * biosize >= filesize) { + blksz = 0; + } else if ((off_t)(lbn + 1) * biosize > filesize) { + blksz = filesize - (off_t)lbn *biosize; + } else { + blksz = biosize; } - /* - * Calculate the number of bytes read and validate only that number - * of bytes. Note that due to pending writes, size may be 0. This - * does not mean that the remaining data is invalid! - */ - - size = count - uio.uio_resid; - VM_OBJECT_WLOCK(vp->v_object); - fuse_vm_page_lock_queues(); - for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { - vm_page_t m; - - nextoff = toff + PAGE_SIZE; - m = pages[i]; - - if (nextoff <= size) { - /* - * Read operation filled an entire page - */ - m->valid = VM_PAGE_BITS_ALL; - KASSERT(m->dirty == 0, - ("fuse_getpages: page %p is dirty", m)); - } else if (size > toff) { - /* - * Read operation filled a partial page. 
- */ - m->valid = 0; - vm_page_set_valid_range(m, 0, size - toff); - KASSERT(m->dirty == 0, - ("fuse_getpages: page %p is dirty", m)); - } else { - /* - * Read operation was short. If no error occurred - * we may have hit a zero-fill section. We simply - * leave valid set to 0. - */ - ; - } - } - fuse_vm_page_unlock_queues(); -out: - VM_OBJECT_WUNLOCK(vp->v_object); - if (ap->a_rbehind) - *ap->a_rbehind = 0; - if (ap->a_rahead) - *ap->a_rahead = 0; - return (VM_PAGER_OK); + return (blksz); } /* - struct vnop_putpages_args { + struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; - int a_sync; - int *a_rtvals; - vm_ooffset_t a_offset; + int a_reqpage; }; */ static int -fuse_vnop_putpages(struct vop_putpages_args *ap) +fuse_vnop_getpages(struct vop_getpages_args *ap) { - struct uio uio; - struct iovec iov; - vm_offset_t kva; - struct buf *bp; - int i, error, npages, count; - off_t offset; - int *rtvals; - struct vnode *vp; - struct thread *td; - struct ucred *cred; - vm_page_t *pages; - vm_ooffset_t fsize; + struct vnode *vp = ap->a_vp; - vp = ap->a_vp; - KASSERT(vp->v_object, ("objectless vp passed to putpages")); - fsize = vp->v_object->un_pager.vnp.vnp_size; - td = curthread; /* XXX */ - cred = curthread->td_ucred; /* XXX */ - pages = ap->a_m; - count = ap->a_count; - rtvals = ap->a_rtvals; - npages = btoc(count); - offset = IDX_TO_OFF(pages[0]->pindex); - if (!fsess_opt_mmap(vnode_mount(vp))) { - SDT_PROBE2(fuse, , vnops, trace, 1, + SDT_PROBE2(fusefs, , vnops, trace, 1, "called on non-cacheable vnode??\n"); + return (VM_PAGER_ERROR); } - for (i = 0; i < npages; i++) - rtvals[i] = VM_PAGER_AGAIN; - /* - * When putting pages, do not extend file past EOF. - */ - - if (offset + count > fsize) { - count = fsize - offset; - if (count < 0) - count = 0; - } - /* - * We use only the kva address for the buffer, but this is extremely - * convenient and fast. 
- */ - bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK); - - kva = (vm_offset_t)bp->b_data; - pmap_qenter(kva, pages, npages); - VM_CNT_INC(v_vnodeout); - VM_CNT_ADD(v_vnodepgsout, count); - - iov.iov_base = (caddr_t)kva; - iov.iov_len = count; - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = offset; - uio.uio_resid = count; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = UIO_WRITE; - uio.uio_td = td; - - error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); - - pmap_qremove(kva, npages); - uma_zfree(fuse_pbuf_zone, bp); - - if (!error) { - int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; - - for (i = 0; i < nwritten; i++) { - rtvals[i] = VM_PAGER_OK; - VM_OBJECT_WLOCK(pages[i]->object); - vm_page_undirty(pages[i]); - VM_OBJECT_WUNLOCK(pages[i]->object); - } - } - return rtvals[0]; + return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz)); } static const char extattr_namespace_separator = '.'; @@ -2024,6 +2019,13 @@ if (fuse_isdeadfs(vp)) return (ENXIO); + if (!fsess_isimpl(mp, FUSE_GETXATTR)) + return EOPNOTSUPP; + + err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); + if (err) + return err; + /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; @@ -2054,8 +2056,10 @@ err = fdisp_wait_answ(&fdi); if (err != 0) { - if (err == ENOSYS) + if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_GETXATTR); + err = EOPNOTSUPP; + } goto out; } @@ -2101,6 +2105,29 @@ if (fuse_isdeadfs(vp)) return (ENXIO); + if (!fsess_isimpl(mp, FUSE_SETXATTR)) + return EOPNOTSUPP; + + if (vfs_isrdonly(mp)) + return EROFS; + + /* Deleting xattrs must use VOP_DELETEEXTATTR instead */ + if (ap->a_uio == NULL) { + /* + * If we got here as fallback from VOP_DELETEEXTATTR, then + * return EOPNOTSUPP. 
+ */ + if (!fsess_isimpl(mp, FUSE_REMOVEXATTR)) + return (EOPNOTSUPP); + else + return (EINVAL); + } + + err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, + VWRITE); + if (err) + return err; + /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; @@ -2128,11 +2155,14 @@ err = fdisp_wait_answ(&fdi); - if (err != 0) { - if (err == ENOSYS) - fsess_set_notimpl(mp, FUSE_SETXATTR); - goto out; + if (err == ENOSYS) { + fsess_set_notimpl(mp, FUSE_SETXATTR); + err = EOPNOTSUPP; } + if (err == ERESTART) { + /* Can't restart after calling uiomove */ + err = EINTR; + } out: fdisp_destroy(&fdi); @@ -2228,6 +2258,13 @@ if (fuse_isdeadfs(vp)) return (ENXIO); + if (!fsess_isimpl(mp, FUSE_LISTXATTR)) + return EOPNOTSUPP; + + err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); + if (err) + return err; + /* * Add space for a NUL and the period separator if enabled. * Default to looking for user attributes. @@ -2252,8 +2289,10 @@ err = fdisp_wait_answ(&fdi); if (err != 0) { - if (err == ENOSYS) + if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LISTXATTR); + err = EOPNOTSUPP; + } goto out; } @@ -2268,7 +2307,7 @@ /* * Retrieve Linux / FUSE compatible list values. */ - fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); + fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); list_xattr_in = fdi.indata; list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out); attr_str = (char *)fdi.indata + sizeof(*list_xattr_in); @@ -2331,6 +2370,17 @@ if (fuse_isdeadfs(vp)) return (ENXIO); + if (!fsess_isimpl(mp, FUSE_REMOVEXATTR)) + return EOPNOTSUPP; + + if (vfs_isrdonly(mp)) + return EROFS; + + err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, + VWRITE); + if (err) + return err; + /* Default to looking for user attributes. 
*/ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; @@ -2348,9 +2398,9 @@ ap->a_name); err = fdisp_wait_answ(&fdi); - if (err != 0) { - if (err == ENOSYS) - fsess_set_notimpl(mp, FUSE_REMOVEXATTR); + if (err == ENOSYS) { + fsess_set_notimpl(mp, FUSE_REMOVEXATTR); + err = EOPNOTSUPP; } fdisp_destroy(&fdi); @@ -2374,3 +2424,48 @@ return 0; } + +/* + * Get an NFS filehandle for a FUSE file. + * + * This will only work for FUSE file systems that guarantee the uniqueness of + * nodeid:generation, which most don't. + */ +/* +vop_vptofh { + IN struct vnode *a_vp; + IN struct fid *a_fhp; +}; +*/ +static int +fuse_vnop_vptofh(struct vop_vptofh_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp); + _Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid), + "FUSE fid type is too big"); + struct mount *mp = vnode_mount(vp); + struct fuse_data *data = fuse_get_mpdata(mp); + struct vattr va; + int err; + + if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) + return EOPNOTSUPP; + + err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread); + if (err) + return err; + + /*ip = VTOI(ap->a_vp);*/ + /*ufhp = (struct ufid *)ap->a_fhp;*/ + fhp->len = sizeof(struct fuse_fid); + fhp->nid = fvdat->nid; + if (fvdat->generation <= UINT32_MAX) + fhp->gen = fvdat->generation; + else + return EOVERFLOW; + return (0); +} + + Index: sys/sys/param.h =================================================================== --- sys/sys/param.h +++ sys/sys/param.h @@ -60,7 +60,7 @@ * in the range 5 to 9. 
*/ #undef __FreeBSD_version -#define __FreeBSD_version 1300038 /* Master, propagated to newvers */ +#define __FreeBSD_version 1300039 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, Index: tests/sys/fs/Makefile =================================================================== --- tests/sys/fs/Makefile +++ tests/sys/fs/Makefile @@ -1,5 +1,7 @@ # $FreeBSD$ +.include + PACKAGE= tests TESTSDIR= ${TESTSBASE}/sys/fs @@ -7,6 +9,9 @@ TESTSRC= ${SRCTOP}/contrib/netbsd-tests/fs #TESTS_SUBDIRS+= nullfs # XXX: needs rump +.if ${COMPILER_FEATURES:Mc++14} +TESTS_SUBDIRS+= fusefs +.endif TESTS_SUBDIRS+= tmpfs ${PACKAGE}FILES+= h_funcs.subr