Index: projects/fuse2/sys/fs/fuse/fuse_device.c =================================================================== --- projects/fuse2/sys/fs/fuse/fuse_device.c (revision 346338) +++ projects/fuse2/sys/fs/fuse/fuse_device.c (revision 346339) @@ -1,460 +1,459 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_ipc.h" SDT_PROVIDER_DECLARE(fuse); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fuse, , device, trace, "int", "char*"); static struct cdev *fuse_dev; static d_open_t fuse_device_open; static d_close_t fuse_device_close; static d_poll_t fuse_device_poll; static d_read_t fuse_device_read; static d_write_t fuse_device_write; static struct cdevsw fuse_device_cdevsw = { .d_open = fuse_device_open, .d_close = fuse_device_close, .d_name = "fuse", .d_poll = fuse_device_poll, .d_read = fuse_device_read, .d_write = fuse_device_write, .d_version = D_VERSION, }; /**************************** * * >>> Fuse device op defs * ****************************/ static void fdata_dtor(void *arg) { struct fuse_data *fdata; fdata = arg; fdata_trydestroy(fdata); } /* * Resources are set up on a per-open basis */ static int fuse_device_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct fuse_data *fdata; int error; SDT_PROBE2(fuse, , device, trace, 1, "device open"); fdata = fdata_alloc(dev, td->td_ucred); error = devfs_set_cdevpriv(fdata, fdata_dtor); if (error != 0) fdata_trydestroy(fdata); else SDT_PROBE2(fuse, , device, trace, 1, "device open success"); return (error); } static int fuse_device_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct fuse_data *data; struct fuse_ticket *tick; int error; error = devfs_get_cdevpriv((void **)&data); if (error != 0) return (error); if (!data) panic("no fuse data upon fuse device close"); fdata_set_dead(data); FUSE_LOCK(); fuse_lck_mtx_lock(data->aw_mtx); /* wakup poll()ers */ selwakeuppri(&data->ks_rsel, PZERO + 1); /* Don't let syscall handlers wait in vain */ while ((tick = fuse_aw_pop(data))) { fuse_lck_mtx_lock(tick->tk_aw_mtx); fticket_set_answered(tick); tick->tk_aw_errno = ENOTCONN; wakeup(tick); fuse_lck_mtx_unlock(tick->tk_aw_mtx); FUSE_ASSERT_AW_DONE(tick); fuse_ticket_drop(tick); } fuse_lck_mtx_unlock(data->aw_mtx); FUSE_UNLOCK(); SDT_PROBE2(fuse, , device, trace, 1, "device close"); return (0); } int fuse_device_poll(struct cdev *dev, int events, struct thread *td) { struct fuse_data *data; int error, revents = 0; error = devfs_get_cdevpriv((void **)&data); if (error != 0) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); if (events & (POLLIN | POLLRDNORM)) { fuse_lck_mtx_lock(data->ms_mtx); if (fdata_get_dead(data) || STAILQ_FIRST(&data->ms_head)) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &data->ks_rsel); fuse_lck_mtx_unlock(data->ms_mtx); } if (events & (POLLOUT | POLLWRNORM)) { revents |= events & (POLLOUT | POLLWRNORM); } return (revents); } /* * fuse_device_read hangs on the queue of VFS messages. * When it's notified that there is a new one, it picks that and * passes up to the daemon */ int fuse_device_read(struct cdev *dev, struct uio *uio, int ioflag) { int err; struct fuse_data *data; struct fuse_ticket *tick; void *buf[] = {NULL, NULL, NULL}; int buflen[3]; int i; SDT_PROBE2(fuse, , device, trace, 1, "fuse device read"); err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); fuse_lck_mtx_lock(data->ms_mtx); again: if (fdata_get_dead(data)) { SDT_PROBE2(fuse, , device, trace, 2, "we know early on that reader should be kicked so we " "don't wait for news"); fuse_lck_mtx_unlock(data->ms_mtx); return (ENODEV); } if (!(tick = fuse_ms_pop(data))) { /* check if we may block */ if (ioflag & O_NONBLOCK) { /* get outa here soon */ fuse_lck_mtx_unlock(data->ms_mtx); return (EAGAIN); } else { err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0); if (err != 0) { fuse_lck_mtx_unlock(data->ms_mtx); return (fdata_get_dead(data) ? ENODEV : err); } tick = fuse_ms_pop(data); } } if (!tick) { /* * We can get here if fuse daemon suddenly terminates, * eg, by being hit by a SIGKILL * -- and some other cases, too, tho not totally clear, when * (cv_signal/wakeup_one signals the whole process ?) */ SDT_PROBE2(fuse, , device, trace, 1, "no message on thread"); goto again; } fuse_lck_mtx_unlock(data->ms_mtx); if (fdata_get_dead(data)) { /* * somebody somewhere -- eg., umount routine -- * wants this liaison finished off */ SDT_PROBE2(fuse, , device, trace, 2, "reader is to be sacked"); if (tick) { SDT_PROBE2(fuse, , device, trace, 2, "weird -- " "\"kick\" is set tho there is message"); FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); } return (ENODEV); /* This should make the daemon get off * of us */ } SDT_PROBE2(fuse, , device, trace, 1, "fuse device read message successfully"); KASSERT(tick->tk_ms_bufdata || tick->tk_ms_bufsize == 0, ("non-null buf pointer with positive size")); switch (tick->tk_ms_type) { case FT_M_FIOV: buf[0] = tick->tk_ms_fiov.base; buflen[0] = tick->tk_ms_fiov.len; break; case FT_M_BUF: buf[0] = tick->tk_ms_fiov.base; buflen[0] = tick->tk_ms_fiov.len; buf[1] = tick->tk_ms_bufdata; buflen[1] = tick->tk_ms_bufsize; break; default: panic("unknown message type for fuse_ticket %p", tick); } for (i = 0; buf[i]; i++) { /* * Why not ban mercilessly stupid daemons who can't keep up * with us? (There is no much use of a partial read here...) */ /* * XXX note that in such cases Linux FUSE throws EIO at the * syscall invoker and stands back to the message queue. The * rationale should be made clear (and possibly adopt that * behaviour). Keeping the current scheme at least makes * fallacy as loud as possible... */ if (uio->uio_resid < buflen[i]) { fdata_set_dead(data); SDT_PROBE2(fuse, , device, trace, 2, "daemon is stupid, kick it off..."); err = ENODEV; break; } err = uiomove(buf[i], buflen[i], uio); if (err) break; } FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); return (err); } static inline int fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) { if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) { SDT_PROBE2(fuse, , device, trace, 1, "Format error: body size " "differs from size claimed by header"); return (EINVAL); } if (uio->uio_resid && ohead->error) { SDT_PROBE2(fuse, , device, trace, 1, "Format error: non zero error but message had a body"); return (EINVAL); } /* Sanitize the linuxism of negative errnos */ ohead->error = -(ohead->error); return (0); } -SDT_PROBE_DEFINE1(fuse, , device, fuse_device_write_bumped_into_callback, - "uint64_t"); +SDT_PROBE_DEFINE1(fuse, , device, fuse_device_write_missing_ticket, "uint64_t"); +SDT_PROBE_DEFINE1(fuse, , device, fuse_device_write_found, "struct fuse_ticket*"); /* * fuse_device_write first reads the header sent by the daemon. * If that's OK, looks up ticket/callback node by the unique id seen in header. * If the callback node contains a handler function, the uio is passed over * that. */ static int fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) { struct fuse_out_header ohead; int err = 0; struct fuse_data *data; struct fuse_ticket *tick, *x_tick; int found = 0; err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); if (uio->uio_resid < sizeof(struct fuse_out_header)) { SDT_PROBE2(fuse, , device, trace, 1, "fuse_device_write got less than a header!"); fdata_set_dead(data); return (EINVAL); } if ((err = uiomove(&ohead, sizeof(struct fuse_out_header), uio)) != 0) return (err); /* * We check header information (which is redundant) and compare it * with what we see. If we see some inconsistency we discard the * whole answer and proceed on as if it had never existed. In * particular, no pretender will be woken up, regardless the * "unique" value in the header. */ if ((err = fuse_ohead_audit(&ohead, uio))) { fdata_set_dead(data); return (err); } /* Pass stuff over to callback if there is one installed */ /* Looking for ticket with the unique id of header */ fuse_lck_mtx_lock(data->aw_mtx); TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link, x_tick) { - SDT_PROBE1(fuse, , device, - fuse_device_write_bumped_into_callback, - tick->tk_unique); if (tick->tk_unique == ohead.unique) { + SDT_PROBE1(fuse, , device, fuse_device_write_found, + tick); found = 1; fuse_aw_remove(tick); break; } } fuse_lck_mtx_unlock(data->aw_mtx); if (found) { if (tick->tk_aw_handler) { /* * We found a callback with proper handler. In this * case the out header will be 0wnd by the callback, * so the fun of freeing that is left for her. * (Then, by all chance, she'll just get that's done * via ticket_drop(), so no manual mucking * around...) */ SDT_PROBE2(fuse, , device, trace, 1, "pass ticket to a callback"); memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); err = tick->tk_aw_handler(tick, uio); } else { /* pretender doesn't wanna do anything with answer */ SDT_PROBE2(fuse, , device, trace, 1, "stuff devalidated, so we drop it"); } /* * As aw_mtx was not held during the callback execution the * ticket may have been inserted again. However, this is safe * because fuse_ticket_drop() will deal with refcount anyway. */ fuse_ticket_drop(tick); } else { /* no callback at all! */ - SDT_PROBE2(fuse, , device, trace, 1, - "erhm, no handler for this response"); + SDT_PROBE1(fuse, , device, fuse_device_write_missing_ticket, + ohead.unique); err = EINVAL; } return (err); } int fuse_device_init(void) { fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse"); if (fuse_dev == NULL) return (ENOMEM); return (0); } void fuse_device_destroy(void) { MPASS(fuse_dev != NULL); destroy_dev(fuse_dev); } Index: projects/fuse2/sys/fs/fuse/fuse_io.c =================================================================== --- projects/fuse2/sys/fs/fuse/fuse_io.c (revision 346338) +++ projects/fuse2/sys/fs/fuse/fuse_io.c (revision 346339) @@ -1,891 +1,908 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_node.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_io.h" SDT_PROVIDER_DECLARE(fuse); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fuse, , io, trace, "int", "char*"); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid); static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid); SDT_PROBE_DEFINE5(fuse, , io, io_dispatch, "struct vnode*", "struct uio*", "int", "struct ucred*", "struct fuse_filehandle*"); int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred, pid_t pid) { struct fuse_filehandle *fufh; int err, directio; int fflag; MPASS(vp->v_type == VREG || vp->v_type == VDIR); fflag = (uio->uio_rw == UIO_READ) ? FREAD : FWRITE; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (err) { printf("FUSE: io dispatch: filehandles are closed\n"); return err; } SDT_PROBE5(fuse, , io, io_dispatch, vp, uio, ioflag, cred, fufh); /* * Ideally, when the daemon asks for direct io at open time, the * standard file flag should be set according to this, so that would * just change the default mode, which later on could be changed via * fcntl(2). * But this doesn't work, the O_DIRECT flag gets cleared at some point * (don't know where). So to make any use of the Fuse direct_io option, * we hardwire it into the file's private data (similarly to Linux, * btw.). */ directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); switch (uio->uio_rw) { case UIO_READ: if (directio) { SDT_PROBE2(fuse, , io, trace, 1, "direct read of vnode"); err = fuse_read_directbackend(vp, uio, cred, fufh); } else { SDT_PROBE2(fuse, , io, trace, 1, "buffered read of vnode"); err = fuse_read_biobackend(vp, uio, cred, fufh, pid); } break; case UIO_WRITE: /* * Kludge: simulate write-through caching via write-around * caching. Same effect, as far as never caching dirty data, * but slightly pessimal in that newly written data is not * cached. */ if (directio || fuse_data_cache_mode == FUSE_CACHE_WT) { off_t start, end; SDT_PROBE2(fuse, , io, trace, 1, "direct write of vnode"); start = uio->uio_offset; end = start + uio->uio_resid; v_inval_buf_range(vp, start, end, fuse_iosize(vp)); err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag); } else { SDT_PROBE2(fuse, , io, trace, 1, "buffered write of vnode"); err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, pid); } break; default: panic("uninterpreted mode passed to fuse_io_dispatch"); } return (err); } SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_start, "int", "int", "int"); SDT_PROBE_DEFINE2(fuse, , io, read_bio_backend_feed, "int", "int"); SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_end, "int", "ssize_t", "int"); static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid) { struct buf *bp; daddr_t lbn; int bcount; int err = 0, n = 0, on = 0; off_t filesize; const int biosize = fuse_iosize(vp); if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); bcount = biosize; filesize = VTOFUD(vp)->filesize; do { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); SDT_PROBE3(fuse, , io, read_bio_backend_start, biosize, (int)lbn, on); /* * Obtain the buffer cache block. Figure out the buffer size * when we are at EOF. If we are modifying the size of the * buffer based on an EOF condition we need to hold * nfs_rslock() through obtaining the buffer to prevent * a potential writer-appender from messing with n_size. * Otherwise we may accidentally truncate the buffer and * lose dirty data. * * Note that bcount is *not* DEV_BSIZE aligned. */ if ((off_t)lbn * biosize >= filesize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { bcount = filesize - (off_t)lbn *biosize; } bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); if (!bp) return (EINTR); /* * If B_CACHE is not set, we must issue the read. If this * fails, we return an error. */ if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); err = fuse_io_strategy(vp, bp); if (err) { brelse(bp); return (err); } } /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount) n = MIN((unsigned)(bcount - on), uio->uio_resid); if (n > 0) { SDT_PROBE2(fuse, , io, read_bio_backend_feed, n, n + (int)bp->b_resid); err = uiomove(bp->b_data + on, n, uio); } brelse(bp); SDT_PROBE3(fuse, , io, read_bio_backend_end, err, uio->uio_resid, n); } while (err == 0 && uio->uio_resid > 0 && n > 0); return (err); } SDT_PROBE_DEFINE1(fuse, , io, read_directbackend_start, "struct fuse_read_in*"); SDT_PROBE_DEFINE2(fuse, , io, read_directbackend_complete, "struct fuse_dispatcher*", "struct uio*"); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { struct fuse_dispatcher fdi; struct fuse_read_in *fri; int err = 0; if (uio->uio_resid == 0) return (0); fdisp_init(&fdi, 0); /* * XXX In "normal" case we use an intermediate kernel buffer for * transmitting data from daemon's context to ours. Eventually, we should * get rid of this. Anyway, if the target uio lives in sysspace (we are * called from pageops), and the input data doesn't need kernel-side * processing (we are not called from readdir) we can already invoke * an optimized, "peer-to-peer" I/O routine. */ while (uio->uio_resid > 0) { fdi.iosize = sizeof(*fri); fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); fri = fdi.indata; fri->fh = fufh->fh_id; fri->offset = uio->uio_offset; fri->size = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_read); SDT_PROBE1(fuse, , io, read_directbackend_start, fri); if ((err = fdisp_wait_answ(&fdi))) goto out; SDT_PROBE2(fuse, , io, read_directbackend_complete, fdi.iosize, uio); if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) break; if (fdi.iosize < fri->size) break; } out: fdisp_destroy(&fdi); return (err); } static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_write_in *fwi; struct fuse_write_out *fwo; struct fuse_dispatcher fdi; size_t chunksize; void *fwi_data; off_t as_written_offset; int diff; int err = 0; bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO; if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio_setoffset(uio, fvdat->filesize); fdisp_init(&fdi, 0); while (uio->uio_resid > 0) { chunksize = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_write); fdi.iosize = sizeof(*fwi) + chunksize; fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); fwi = fdi.indata; fwi->fh = fufh->fh_id; fwi->offset = uio->uio_offset; fwi->size = chunksize; fwi_data = (char *)fdi.indata + sizeof(*fwi); if ((err = uiomove(fwi_data, chunksize, uio))) break; retry: - if ((err = fdisp_wait_answ(&fdi))) + err = fdisp_wait_answ(&fdi); + if (err == ERESTART || err == EINTR || err == EWOULDBLOCK) { + /* + * Rewind the uio so dofilewrite will know it's + * incomplete + */ + uio->uio_resid += fwi->size; + uio->uio_offset -= fwi->size; + /* + * Change ERESTART into EINTR because we can't rewind + * uio->uio_iov. Basically, once uiomove(9) has been + * called, it's impossible to restart a syscall. + */ + if (err == ERESTART) + err = EINTR; break; + } else if (err) { + break; + } fwo = ((struct fuse_write_out *)fdi.answ); /* Adjust the uio in the case of short writes */ diff = fwi->size - fwo->size; as_written_offset = uio->uio_offset - diff; if (as_written_offset - diff > fvdat->filesize && fuse_data_cache_mode != FUSE_CACHE_UC) { fuse_vnode_setsize(vp, cred, as_written_offset); fvdat->flag &= ~FN_SIZECHANGE; } if (diff < 0) { printf("WARNING: misbehaving FUSE filesystem " "wrote more data than we provided it\n"); err = EINVAL; break; } else if (diff > 0) { /* Short write */ if (!direct_io) { printf("WARNING: misbehaving FUSE filesystem: " "short writes are only allowed with " "direct_io\n"); } if (ioflag & IO_DIRECT) { /* Return early */ uio->uio_resid += diff; uio->uio_offset -= diff; break; } else { /* Resend the unwritten portion of data */ fdi.iosize = sizeof(*fwi) + diff; /* Refresh fdi without clearing data buffer */ fdisp_refresh_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); fwi = fdi.indata; MPASS2(fwi == fdi.indata, "FUSE dispatcher " "reallocated despite no increase in " "size?"); void *src = (char*)fwi_data + fwo->size; memmove(fwi_data, src, diff); fwi->fh = fufh->fh_id; fwi->offset = as_written_offset; fwi->size = diff; goto retry; } } } fdisp_destroy(&fdi); return (err); } SDT_PROBE_DEFINE6(fuse, , io, write_biobackend_start, "int64_t", "int", "int", "struct uio*", "int", "bool"); SDT_PROBE_DEFINE2(fuse, , io, write_biobackend_append_race, "long", "int"); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct buf *bp; daddr_t lbn; int bcount; int n, on, err = 0; const int biosize = fuse_iosize(vp); KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); if (vp->v_type != VREG) return (EIO); if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio_setoffset(uio, fvdat->filesize); /* * Find all of this file's B_NEEDCOMMIT buffers. If our writes * would exceed the local maximum per-file write commit size when * combined with those, we must decide whether to flush, * go synchronous, or return err. We don't bother checking * IO_UNIT -- we just make all writes atomic anyway, as there's * no point optimizing for something that really won't ever happen. */ do { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); n = MIN((unsigned)(biosize - on), uio->uio_resid); again: /* * Handle direct append and file extension cases, calculate * unaligned buffer size. */ if (uio->uio_offset == fvdat->filesize && n) { /* * Get the buffer (in its pre-append state to maintain * B_CACHE if it was previously set). Resize the * nfsnode after we have locked the buffer to prevent * readers from reading garbage. */ bcount = on; SDT_PROBE6(fuse, , io, write_biobackend_start, lbn, on, n, uio, bcount, true); bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); if (bp != NULL) { long save; err = fuse_vnode_setsize(vp, cred, uio->uio_offset + n); if (err) { brelse(bp); break; } save = bp->b_flags & B_CACHE; bcount += n; allocbuf(bp, bcount); bp->b_flags |= save; } } else { /* * Obtain the locked cache block first, and then * adjust the file's size as appropriate. */ bcount = on + n; if ((off_t)lbn * biosize + bcount < fvdat->filesize) { if ((off_t)(lbn + 1) * biosize < fvdat->filesize) bcount = biosize; else bcount = fvdat->filesize - (off_t)lbn *biosize; } SDT_PROBE6(fuse, , io, write_biobackend_start, lbn, on, n, uio, bcount, false); bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); if (bp && uio->uio_offset + n > fvdat->filesize) { err = fuse_vnode_setsize(vp, cred, uio->uio_offset + n); if (err) { brelse(bp); break; } } } if (!bp) { err = EINTR; break; } /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ if (on == 0 && n == bcount) { bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); fuse_io_strategy(vp, bp); if ((err = bp->b_error)) { brelse(bp); break; } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { SDT_PROBE2(fuse, , io, write_biobackend_append_race, (long)bp->b_blkno * biosize, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. * * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * * as an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { /* * Yes, we mean it. Write out everything to "storage" * immediately, without hesitation. (Apart from other * reasons: the only way to know if a write is valid * if its actually written out.) */ bwrite(bp); if (bp->b_error == EINTR) { err = EINTR; break; } goto again; } err = uiomove((char *)bp->b_data + on, n, uio); /* * Since this block is being modified, it must be written * again and not just committed. Since write clustering does * not work for the stage 1 data write, only the stage 2 * commit rpc, we have to clear B_CLUSTEROK as well. */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (err) { bp->b_ioflags |= BIO_ERROR; bp->b_error = err; brelse(bp); break; } /* * Only update dirtyoff/dirtyend if not a degenerate * condition. */ if (n) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_valid(bp, on, n); } err = bwrite(bp); if (err) break; } while (uio->uio_resid > 0 && n > 0); if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) fuse_vnode_savesize(vp, cred, pid); return (err); } int fuse_io_strategy(struct vnode *vp, struct buf *bp) { struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct ucred *cred; struct uio *uiop; struct uio uio; struct iovec io; int error = 0; int fflag; /* We don't know the true pid when we're dealing with the cache */ pid_t pid = 0; const int biosize = fuse_iosize(vp); MPASS(vp->v_type == VREG || vp->v_type == VDIR); MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE; cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (bp->b_iocmd == BIO_READ && error == EBADF) { /* * This may be a read-modify-write operation on a cached file * opened O_WRONLY. The FUSE protocol allows this. * * TODO: eliminate this hacky check once the FUFH table is gone */ error = fuse_filehandle_get(vp, FWRITE, &fufh, cred, pid); } if (error) { printf("FUSE: strategy: filehandles are closed\n"); bp->b_ioflags |= BIO_ERROR; bp->b_error = error; bufdone(bp); return (error); } uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = curthread; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. */ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("fuse_io_strategy: bp %p already marked done", bp)); if (bp->b_iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; error = fuse_read_directbackend(vp, uiop, cred, fufh); /* XXXCEM: Potentially invalid access to cached_attrs here */ if ((!error && uiop->uio_resid) || (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && uiop->uio_offset >= fvdat->cached_attrs.va_size)) { /* * If we had a short read with no error, we must have * hit a file hole. We should zero-fill the remainder. * This can also occur if the server hits the file EOF. * * Holes used to be able to occur due to pending * writes, but that is not possible any longer. */ int nread = bp->b_bcount - uiop->uio_resid; int left = uiop->uio_resid; if (error != 0) { printf("FUSE: Fix broken io: offset %ju, " " resid %zd, file size %ju/%ju\n", (uintmax_t)uiop->uio_offset, uiop->uio_resid, fvdat->filesize, fvdat->cached_attrs.va_size); error = 0; } if (left > 0) bzero((char *)bp->b_data + nread, left); uiop->uio_resid = 0; } if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * If we only need to commit, try to commit */ if (bp->b_flags & B_NEEDCOMMIT) { SDT_PROBE2(fuse, , io, trace, 1, "write: B_NEEDCOMMIT flags set"); } /* * Setup for actual write */ if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > fvdat->filesize) bp->b_dirtyend = fvdat->filesize - (off_t)bp->b_blkno * biosize; if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_blkno * biosize + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; error = fuse_write_directbackend(vp, uiop, cred, fufh, 0); if (error == EINTR || error == ETIMEDOUT || (!error && (bp->b_flags & B_NEEDCOMMIT))) { bp->b_flags &= ~(B_INVAL | B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((error == EINTR || error == ETIMEDOUT) && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_flags |= B_INVAL; bp->b_error = error; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; bufdone(bp); return (error); } int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td) { struct vop_fsync_args a = { .a_vp = vp, .a_waitfor = waitfor, .a_td = td, }; return (vop_stdfsync(&a)); } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int fuse_io_invalbuf(struct vnode *vp, struct thread *td) { struct fuse_vnode_data *fvdat = VTOFUD(vp); int error = 0; if (vp->v_iflag & VI_DOOMED) return 0; ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf"); while (fvdat->flag & FN_FLUSHINPROG) { struct proc *p = td->td_proc; if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) return EIO; fvdat->flag |= FN_FLUSHWANT; tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); error = 0; if (p != NULL) { PROC_LOCK(p); if (SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist)) error = EINTR; PROC_UNLOCK(p); } if (error == EINTR) return EINTR; } fvdat->flag |= FN_FLUSHINPROG; if (vp->v_bufobj.bo_object != NULL) { VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); while (error) { if (error == ERESTART || error == EINTR) { fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return EINTR; } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); } fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return (error); } Index: projects/fuse2/sys/fs/fuse/fuse_ipc.c =================================================================== --- projects/fuse2/sys/fs/fuse/fuse_ipc.c (revision 346338) +++ projects/fuse2/sys/fs/fuse/fuse_ipc.c (revision 346339) @@ -1,896 +1,996 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_node.h" #include "fuse_ipc.h" #include "fuse_internal.h" SDT_PROVIDER_DECLARE(fuse); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fuse, , ipc, trace, "int", "char*"); +static void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, + struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred); static void fiov_clear(struct fuse_iov *fiov); +static void fuse_interrupt_send(struct fuse_ticket *otick); static struct fuse_ticket *fticket_alloc(struct fuse_data *data); static void fticket_refresh(struct fuse_ticket *ftick); static void fticket_destroy(struct fuse_ticket *ftick); static int fticket_wait_answer(struct fuse_ticket *ftick); static inline int fticket_aw_pull_uio(struct fuse_ticket *ftick, struct uio *uio); static int fuse_body_audit(struct fuse_ticket *ftick, size_t blen); static fuse_handler_t fuse_standard_handler; SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW, 0, "FUSE tunables"); SYSCTL_STRING(_vfs_fusefs, OID_AUTO, version, CTLFLAG_RD, FUSE_FREEBSD_VERSION, 0, "fuse-freebsd version"); static int fuse_ticket_count = 0; SYSCTL_INT(_vfs_fusefs, OID_AUTO, ticket_count, CTLFLAG_RW, &fuse_ticket_count, 0, "number of allocated tickets"); static long fuse_iov_permanent_bufsize = 1 << 19; SYSCTL_LONG(_vfs_fusefs, OID_AUTO, iov_permanent_bufsize, CTLFLAG_RW, &fuse_iov_permanent_bufsize, 0, "limit for permanently stored buffer size for fuse_iovs"); static int fuse_iov_credit = 16; SYSCTL_INT(_vfs_fusefs, OID_AUTO, iov_credit, CTLFLAG_RW, &fuse_iov_credit, 0, "how many times is an oversized fuse_iov tolerated"); MALLOC_DEFINE(M_FUSEMSG, "fuse_msgbuf", "fuse message buffer"); static uma_zone_t ticket_zone; -static void -fuse_block_sigs(sigset_t *oldset) +/* + * TODO: figure out how to timeout INTERRUPT requests, because the daemon may + * leagally never respond + * + * TODO: remove an INTERRUPT request if the daemon responds to the original + */ +static int +fuse_interrupt_callback(struct fuse_ticket *tick, struct uio *uio) { - sigset_t newset; + if (tick->tk_aw_ohead.error == EAGAIN) { + /* + * There are two reasons we might get this: + * 1) the daemon received the INTERRUPT request before the + * original, or + * 2) the daemon received the INTERRUPT request after it + * completed the original request. + * In the first case we should re-send the INTERRUPT. In the + * second, we should ignore it. + */ + struct fuse_interrupt_in *fii; + struct fuse_data *data; + struct fuse_ticket *otick, *x_tick; + bool found = false; - SIGFILLSET(newset); - SIGDELSET(newset, SIGKILL); - if (kern_sigprocmask(curthread, SIG_BLOCK, &newset, oldset, 0)) - panic("%s: Invalid operation for kern_sigprocmask()", - __func__); + data = tick->tk_data; + fii = (struct fuse_interrupt_in*)((char*)tick->tk_ms_fiov.base + + sizeof(struct fuse_in_header)); + fuse_lck_mtx_lock(data->aw_mtx); + TAILQ_FOREACH_SAFE(otick, &data->aw_head, tk_aw_link, x_tick) { + if (otick->tk_unique == fii->unique) { + found = true; + break; + } + } + fuse_lck_mtx_unlock(data->aw_mtx); + if (found) { + /* Resend */ + fuse_interrupt_send(otick); + } else { + /* Original is already complete; nothing to do */ + } + return 0; + } else { + /* Illegal FUSE_INTERRUPT response */ + return EINVAL; + } } -static void -fuse_restore_sigs(sigset_t *oldset) +void +fuse_interrupt_send(struct fuse_ticket *otick) { + struct fuse_dispatcher fdi; + struct fuse_interrupt_in *fii; + struct fuse_in_header *ftick_hdr; + struct fuse_data *data = otick->tk_data; + struct ucred reused_creds; - if (kern_sigprocmask(curthread, SIG_SETMASK, oldset, NULL, 0)) - panic("%s: Invalid operation for kern_sigprocmask()", - __func__); + ftick_hdr = fticket_in_header(otick); + reused_creds.cr_uid = ftick_hdr->uid; + reused_creds.cr_rgid = ftick_hdr->gid; + fdisp_init(&fdi, sizeof(*fii)); + fdisp_make_pid(&fdi, FUSE_INTERRUPT, data, ftick_hdr->nodeid, + ftick_hdr->pid, &reused_creds); + + fii = fdi.indata; + fii->unique = otick->tk_unique; + fuse_insert_callback(fdi.tick, fuse_interrupt_callback); + + fuse_insert_message(fdi.tick); + fdisp_destroy(&fdi); } void fiov_init(struct fuse_iov *fiov, size_t size) { uint32_t msize = FU_AT_LEAST(size); fiov->len = 0; fiov->base = malloc(msize, M_FUSEMSG, M_WAITOK | M_ZERO); fiov->allocated_size = msize; fiov->credit = fuse_iov_credit; } void fiov_teardown(struct fuse_iov *fiov) { MPASS(fiov->base != NULL); free(fiov->base, M_FUSEMSG); } void fiov_adjust(struct fuse_iov *fiov, size_t size) { if (fiov->allocated_size < size || (fuse_iov_permanent_bufsize >= 0 && fiov->allocated_size - size > fuse_iov_permanent_bufsize && --fiov->credit < 0)) { fiov->base = realloc(fiov->base, FU_AT_LEAST(size), M_FUSEMSG, M_WAITOK | M_ZERO); if (!fiov->base) { panic("FUSE: realloc failed"); } fiov->allocated_size = FU_AT_LEAST(size); fiov->credit = fuse_iov_credit; /* Clear data buffer after reallocation */ bzero(fiov->base, size); } else if (size > fiov->len) { /* Clear newly extended portion of data buffer */ bzero((char*)fiov->base + fiov->len, size - fiov->len); } fiov->len = size; } /* Clear the fiov's data buffer */ static void fiov_clear(struct fuse_iov *fiov) { bzero(fiov->base, fiov->len); } /* Resize the fiov if needed, and clear it's buffer */ void fiov_refresh(struct fuse_iov *fiov) { fiov_adjust(fiov, 0); } static int fticket_ctor(void *mem, int size, void *arg, int flags) { struct fuse_ticket *ftick = mem; struct fuse_data *data = arg; FUSE_ASSERT_MS_DONE(ftick); FUSE_ASSERT_AW_DONE(ftick); ftick->tk_data = data; if (ftick->tk_unique != 0) fticket_refresh(ftick); /* May be truncated to 32 bits */ ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1); if (ftick->tk_unique == 0) ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1); refcount_init(&ftick->tk_refcount, 1); atomic_add_acq_int(&fuse_ticket_count, 1); return 0; } static void fticket_dtor(void *mem, int size, void *arg) { #ifdef INVARIANTS struct fuse_ticket *ftick = mem; #endif FUSE_ASSERT_MS_DONE(ftick); FUSE_ASSERT_AW_DONE(ftick); atomic_subtract_acq_int(&fuse_ticket_count, 1); } static int fticket_init(void *mem, int size, int flags) { struct fuse_ticket *ftick = mem; bzero(ftick, sizeof(struct fuse_ticket)); fiov_init(&ftick->tk_ms_fiov, sizeof(struct fuse_in_header)); ftick->tk_ms_type = FT_M_FIOV; mtx_init(&ftick->tk_aw_mtx, "fuse answer delivery mutex", NULL, MTX_DEF); fiov_init(&ftick->tk_aw_fiov, 0); ftick->tk_aw_type = FT_A_FIOV; return 0; } static void fticket_fini(void *mem, int size) { struct fuse_ticket *ftick = mem; fiov_teardown(&ftick->tk_ms_fiov); fiov_teardown(&ftick->tk_aw_fiov); mtx_destroy(&ftick->tk_aw_mtx); } static inline struct fuse_ticket * fticket_alloc(struct fuse_data *data) { return uma_zalloc_arg(ticket_zone, data, M_WAITOK); } static inline void fticket_destroy(struct fuse_ticket *ftick) { return uma_zfree(ticket_zone, ftick); } static inline void fticket_refresh(struct fuse_ticket *ftick) { FUSE_ASSERT_MS_DONE(ftick); FUSE_ASSERT_AW_DONE(ftick); fiov_refresh(&ftick->tk_ms_fiov); ftick->tk_ms_bufdata = NULL; ftick->tk_ms_bufsize = 0; ftick->tk_ms_type = FT_M_FIOV; bzero(&ftick->tk_aw_ohead, sizeof(struct fuse_out_header)); fiov_refresh(&ftick->tk_aw_fiov); ftick->tk_aw_errno = 0; ftick->tk_aw_bufdata = NULL; ftick->tk_aw_bufsize = 0; ftick->tk_aw_type = FT_A_FIOV; ftick->tk_flag = 0; } /* Prepar the ticket to be reused, but don't clear its data buffers */ static inline void fticket_reset(struct fuse_ticket *ftick) { FUSE_ASSERT_MS_DONE(ftick); FUSE_ASSERT_AW_DONE(ftick); ftick->tk_ms_bufdata = NULL; ftick->tk_ms_bufsize = 0; ftick->tk_ms_type = FT_M_FIOV; bzero(&ftick->tk_aw_ohead, sizeof(struct fuse_out_header)); ftick->tk_aw_errno = 0; ftick->tk_aw_bufdata = NULL; ftick->tk_aw_bufsize = 0; ftick->tk_aw_type = FT_A_FIOV; ftick->tk_flag = 0; } static int fticket_wait_answer(struct fuse_ticket *ftick) { - sigset_t tset; + struct thread *td = curthread; + sigset_t blockedset, oldset; int err = 0; struct fuse_data *data; fuse_lck_mtx_lock(ftick->tk_aw_mtx); + SIGEMPTYSET(blockedset); + kern_sigprocmask(td, SIG_BLOCK, &blockedset, &oldset, 0); +retry: if (fticket_answered(ftick)) { goto out; } data = ftick->tk_data; if (fdata_get_dead(data)) { err = ENOTCONN; fticket_set_answered(ftick); goto out; } - fuse_block_sigs(&tset); err = msleep(ftick, &ftick->tk_aw_mtx, PCATCH, "fu_ans", data->daemon_timeout * hz); - fuse_restore_sigs(&tset); - if (err == EAGAIN) { /* same as EWOULDBLOCK */ + kern_sigprocmask(td, SIG_SETMASK, &oldset, NULL, 0); + if (err == EWOULDBLOCK) { + SDT_PROBE2(fuse, , ipc, trace, 3, + "fticket_wait_answer: EWOULDBLOCK"); #ifdef XXXIP /* die conditionally */ if (!fdata_get_dead(data)) { fdata_set_dead(data); } #endif err = ETIMEDOUT; fticket_set_answered(ftick); + } else if ((err == EINTR || err == ERESTART)) { + /* + * Whether we get EINTR or ERESTART depends on whether + * SA_RESTART was set by sigaction(2). + * + * Try to interrupt the operation and wait for an EINTR response + * to the original operation. If the file system does not + * support FUSE_INTERRUPT, then we'll just wait for it to + * complete like normal. If it does support FUSE_INTERRUPT, + * then it will either respond EINTR to the original operation, + * or EAGAIN to the interrupt. + */ + int sig; + + SDT_PROBE2(fuse, , ipc, trace, 4, + "fticket_wait_answer: interrupt"); + fuse_lck_mtx_unlock(ftick->tk_aw_mtx); + fuse_interrupt_send(ftick); + fuse_lck_mtx_lock(ftick->tk_aw_mtx); + + /* TODO: return, rather than retry, for fatal signals */ + + /* + * Block the just-delivered signal while we wait for an + * interrupt response + */ + PROC_LOCK(td->td_proc); + mtx_lock(&td->td_proc->p_sigacts->ps_mtx); + sig = cursig(td); + mtx_unlock(&td->td_proc->p_sigacts->ps_mtx); + PROC_UNLOCK(td->td_proc); + SIGADDSET(blockedset, sig); + kern_sigprocmask(curthread, SIG_BLOCK, &blockedset, NULL, 0); + goto retry; + } else if (err) { + SDT_PROBE2(fuse, , ipc, trace, 6, + "fticket_wait_answer: other error"); + } else { + SDT_PROBE2(fuse, , ipc, trace, 7, "fticket_wait_answer: OK"); } out: if (!(err || fticket_answered(ftick))) { SDT_PROBE2(fuse, , ipc, trace, 1, "FUSE: requester was woken up but still no answer"); err = ENXIO; } fuse_lck_mtx_unlock(ftick->tk_aw_mtx); return err; } static inline int fticket_aw_pull_uio(struct fuse_ticket *ftick, struct uio *uio) { int err = 0; size_t len = uio_resid(uio); if (len) { switch (ftick->tk_aw_type) { case FT_A_FIOV: fiov_adjust(fticket_resp(ftick), len); err = uiomove(fticket_resp(ftick)->base, len, uio); break; case FT_A_BUF: ftick->tk_aw_bufsize = len; err = uiomove(ftick->tk_aw_bufdata, len, uio); break; default: panic("FUSE: unknown answer type for ticket %p", ftick); } } return err; } int fticket_pull(struct fuse_ticket *ftick, struct uio *uio) { int err = 0; if (ftick->tk_aw_ohead.error) { return 0; } err = fuse_body_audit(ftick, uio_resid(uio)); if (!err) { err = fticket_aw_pull_uio(ftick, uio); } return err; } struct fuse_data * fdata_alloc(struct cdev *fdev, struct ucred *cred) { struct fuse_data *data; data = malloc(sizeof(struct fuse_data), M_FUSEMSG, M_WAITOK | M_ZERO); data->fdev = fdev; mtx_init(&data->ms_mtx, "fuse message list mutex", NULL, MTX_DEF); STAILQ_INIT(&data->ms_head); mtx_init(&data->aw_mtx, "fuse answer list mutex", NULL, MTX_DEF); TAILQ_INIT(&data->aw_head); data->daemoncred = crhold(cred); data->daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT; sx_init(&data->rename_lock, "fuse rename lock"); data->ref = 1; return data; } void fdata_trydestroy(struct fuse_data *data) { data->ref--; MPASS(data->ref >= 0); if (data->ref != 0) return; /* Driving off stage all that stuff thrown at device... */ mtx_destroy(&data->ms_mtx); mtx_destroy(&data->aw_mtx); sx_destroy(&data->rename_lock); crfree(data->daemoncred); free(data, M_FUSEMSG); } void fdata_set_dead(struct fuse_data *data) { FUSE_LOCK(); if (fdata_get_dead(data)) { FUSE_UNLOCK(); return; } fuse_lck_mtx_lock(data->ms_mtx); data->dataflags |= FSESS_DEAD; wakeup_one(data); selwakeuppri(&data->ks_rsel, PZERO + 1); wakeup(&data->ticketer); fuse_lck_mtx_unlock(data->ms_mtx); FUSE_UNLOCK(); } struct fuse_ticket * fuse_ticket_fetch(struct fuse_data *data) { int err = 0; struct fuse_ticket *ftick; ftick = fticket_alloc(data); if (!(data->dataflags & FSESS_INITED)) { /* Sleep until get answer for INIT messsage */ FUSE_LOCK(); if (!(data->dataflags & FSESS_INITED) && data->ticketer > 2) { err = msleep(&data->ticketer, &fuse_mtx, PCATCH | PDROP, "fu_ini", 0); if (err) fdata_set_dead(data); } else FUSE_UNLOCK(); } return ftick; } int fuse_ticket_drop(struct fuse_ticket *ftick) { int die; die = refcount_release(&ftick->tk_refcount); if (die) fticket_destroy(ftick); return die; } void fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t * handler) { if (fdata_get_dead(ftick->tk_data)) { return; } ftick->tk_aw_handler = handler; fuse_lck_mtx_lock(ftick->tk_data->aw_mtx); fuse_aw_push(ftick); fuse_lck_mtx_unlock(ftick->tk_data->aw_mtx); } void fuse_insert_message(struct fuse_ticket *ftick) { if (ftick->tk_flag & FT_DIRTY) { panic("FUSE: ticket reused without being refreshed"); } ftick->tk_flag |= FT_DIRTY; if (fdata_get_dead(ftick->tk_data)) { return; } fuse_lck_mtx_lock(ftick->tk_data->ms_mtx); fuse_ms_push(ftick); wakeup_one(ftick->tk_data); selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1); fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx); } static int fuse_body_audit(struct fuse_ticket *ftick, size_t blen) { int err = 0; enum fuse_opcode opcode; opcode = fticket_opcode(ftick); switch (opcode) { case FUSE_LOOKUP: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_FORGET: panic("FUSE: a handler has been intalled for FUSE_FORGET"); break; case FUSE_GETATTR: err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL; break; case FUSE_SETATTR: err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL; break; case FUSE_READLINK: err = (PAGE_SIZE >= blen) ? 0 : EINVAL; break; case FUSE_SYMLINK: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_MKNOD: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_MKDIR: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_UNLINK: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_RMDIR: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_RENAME: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_LINK: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_OPEN: err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL; break; case FUSE_READ: err = (((struct fuse_read_in *)( (char *)ftick->tk_ms_fiov.base + sizeof(struct fuse_in_header) ))->size >= blen) ? 0 : EINVAL; break; case FUSE_WRITE: err = (blen == sizeof(struct fuse_write_out)) ? 0 : EINVAL; break; case FUSE_STATFS: if (fuse_libabi_geq(ftick->tk_data, 7, 4)) { err = (blen == sizeof(struct fuse_statfs_out)) ? 0 : EINVAL; } else { err = (blen == FUSE_COMPAT_STATFS_SIZE) ? 0 : EINVAL; } break; case FUSE_RELEASE: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_FSYNC: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_SETXATTR: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_GETXATTR: case FUSE_LISTXATTR: /* * These can have varying response lengths, and 0 length * isn't necessarily invalid. */ err = 0; break; case FUSE_REMOVEXATTR: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_FLUSH: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_INIT: if (blen == sizeof(struct fuse_init_out) || blen == 8) { err = 0; } else { err = EINVAL; } break; case FUSE_OPENDIR: err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL; break; case FUSE_READDIR: err = (((struct fuse_read_in *)( (char *)ftick->tk_ms_fiov.base + sizeof(struct fuse_in_header) ))->size >= blen) ? 0 : EINVAL; break; case FUSE_RELEASEDIR: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_FSYNCDIR: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_GETLK: err = (blen == sizeof(struct fuse_lk_out)) ? 0 : EINVAL; break; case FUSE_SETLK: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_SETLKW: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_ACCESS: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_CREATE: err = (blen == sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)) ? 0 : EINVAL; break; case FUSE_DESTROY: err = (blen == 0) ? 0 : EINVAL; break; default: panic("FUSE: opcodes out of sync (%d)\n", opcode); } return err; } static inline void fuse_setup_ihead(struct fuse_in_header *ihead, struct fuse_ticket *ftick, uint64_t nid, enum fuse_opcode op, size_t blen, pid_t pid, struct ucred *cred) { ihead->len = sizeof(*ihead) + blen; ihead->unique = ftick->tk_unique; ihead->nodeid = nid; ihead->opcode = op; ihead->pid = pid; ihead->uid = cred->cr_uid; ihead->gid = cred->cr_rgid; } /* * fuse_standard_handler just pulls indata and wakes up pretender. * Doesn't try to interpret data, that's left for the pretender. * Though might do a basic size verification before the pull-in takes place */ static int fuse_standard_handler(struct fuse_ticket *ftick, struct uio *uio) { int err = 0; err = fticket_pull(ftick, uio); fuse_lck_mtx_lock(ftick->tk_aw_mtx); if (!fticket_answered(ftick)) { fticket_set_answered(ftick); ftick->tk_aw_errno = err; wakeup(ftick); } fuse_lck_mtx_unlock(ftick->tk_aw_mtx); return err; } /* * Reinitialize a dispatcher from a pid and node id, without resizing or * clearing its data buffers */ static void fdisp_refresh_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred) { MPASS(fdip->tick); MPASS2(sizeof(fdip->finh) + fdip->iosize <= fdip->tick->tk_ms_fiov.len, "Must use fdisp_make_pid to increase the size of the fiov"); fticket_reset(fdip->tick); FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh, fdip->indata, fdip->iosize); fuse_setup_ihead(fdip->finh, fdip->tick, nid, op, fdip->iosize, pid, cred); } /* Initialize a dispatcher from a pid and node id */ static void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, - struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred) + struct fuse_data *data, uint64_t nid, pid_t pid, struct ucred *cred) { - struct fuse_data *data = fuse_get_mpdata(mp); - if (fdip->tick) { fticket_refresh(fdip->tick); } else { fdip->tick = fuse_ticket_fetch(data); } /* FUSE_DIMALLOC will bzero the fiovs when it enlarges them */ FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh, fdip->indata, fdip->iosize); fuse_setup_ihead(fdip->finh, fdip->tick, nid, op, fdip->iosize, pid, cred); } void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred) { + struct fuse_data *data = fuse_get_mpdata(mp); RECTIFY_TDCR(td, cred); - return fdisp_make_pid(fdip, op, mp, nid, td->td_proc->p_pid, cred); + return fdisp_make_pid(fdip, op, data, nid, td->td_proc->p_pid, cred); } void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred) { + struct mount *mp = vnode_mount(vp); + struct fuse_data *data = fuse_get_mpdata(mp); + RECTIFY_TDCR(td, cred); - return fdisp_make_pid(fdip, op, vnode_mount(vp), VTOI(vp), + return fdisp_make_pid(fdip, op, data, VTOI(vp), td->td_proc->p_pid, cred); } /* Refresh a fuse_dispatcher so it can be reused, but don't zero its data */ void fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred) { RECTIFY_TDCR(td, cred); return fdisp_refresh_pid(fdip, op, vnode_mount(vp), VTOI(vp), td->td_proc->p_pid, cred); } void fdisp_refresh(struct fuse_dispatcher *fdip) { fticket_refresh(fdip->tick); } SDT_PROBE_DEFINE2(fuse, , ipc, fdisp_wait_answ_error, "char*", "int"); int fdisp_wait_answ(struct fuse_dispatcher *fdip) { int err = 0; fdip->answ_stat = 0; fuse_insert_callback(fdip->tick, fuse_standard_handler); fuse_insert_message(fdip->tick); if ((err = fticket_wait_answer(fdip->tick))) { fuse_lck_mtx_lock(fdip->tick->tk_aw_mtx); if (fticket_answered(fdip->tick)) { /* * Just between noticing the interrupt and getting here, * the standard handler has completed his job. * So we drop the ticket and exit as usual. */ SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, "IPC: interrupted, already answered", err); fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx); goto out; } else { /* * So we were faster than the standard handler. * Then by setting the answered flag we get *him* * to drop the ticket. */ SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, "IPC: interrupted, setting to answered", err); fticket_set_answered(fdip->tick); fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx); return err; } } if (fdip->tick->tk_aw_errno) { SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, "IPC: explicit EIO-ing", fdip->tick->tk_aw_errno); err = EIO; goto out; } if ((err = fdip->tick->tk_aw_ohead.error)) { SDT_PROBE2(fuse, , ipc, fdisp_wait_answ_error, "IPC: setting status", fdip->tick->tk_aw_ohead.error); /* * This means a "proper" fuse syscall error. * We record this value so the caller will * be able to know it's not a boring messaging * failure, if she wishes so (and if not, she can * just simply propagate the return value of this routine). * [XXX Maybe a bitflag would do the job too, * if other flags needed, this will be converted thusly.] */ fdip->answ_stat = err; goto out; } fdip->answ = fticket_resp(fdip->tick)->base; fdip->iosize = fticket_resp(fdip->tick)->len; return 0; out: return err; } void fuse_ipc_init(void) { ticket_zone = uma_zcreate("fuse_ticket", sizeof(struct fuse_ticket), fticket_ctor, fticket_dtor, fticket_init, fticket_fini, UMA_ALIGN_PTR, 0); } void fuse_ipc_destroy(void) { uma_zdestroy(ticket_zone); } Index: projects/fuse2/sys/fs/fuse/fuse_ipc.h =================================================================== --- projects/fuse2/sys/fs/fuse/fuse_ipc.h (revision 346338) +++ projects/fuse2/sys/fs/fuse/fuse_ipc.h (revision 346339) @@ -1,399 +1,410 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _FUSE_IPC_H_ #define _FUSE_IPC_H_ #include #include struct fuse_iov { void *base; size_t len; size_t allocated_size; int credit; }; void fiov_init(struct fuse_iov *fiov, size_t size); void fiov_teardown(struct fuse_iov *fiov); void fiov_refresh(struct fuse_iov *fiov); void fiov_adjust(struct fuse_iov *fiov, size_t size); #define FUSE_DIMALLOC(fiov, spc1, spc2, amnt) do { \ fiov_adjust(fiov, (sizeof(*(spc1)) + (amnt))); \ (spc1) = (fiov)->base; \ (spc2) = (char *)(fiov)->base + (sizeof(*(spc1))); \ } while (0) #define FU_AT_LEAST(siz) max((siz), 160) #define FUSE_ASSERT_AW_DONE(ftick) \ KASSERT((ftick)->tk_aw_link.tqe_next == NULL && \ (ftick)->tk_aw_link.tqe_prev == NULL, \ ("FUSE: ticket still on answer delivery list %p", (ftick))) #define FUSE_ASSERT_MS_DONE(ftick) \ KASSERT((ftick)->tk_ms_link.stqe_next == NULL, \ ("FUSE: ticket still on message list %p", (ftick))) struct fuse_ticket; struct fuse_data; typedef int fuse_handler_t(struct fuse_ticket *ftick, struct uio *uio); struct fuse_ticket { /* fields giving the identity of the ticket */ uint64_t tk_unique; struct fuse_data *tk_data; int tk_flag; u_int tk_refcount; /* fields for initiating an upgoing message */ struct fuse_iov tk_ms_fiov; void *tk_ms_bufdata; size_t tk_ms_bufsize; enum { FT_M_FIOV, FT_M_BUF } tk_ms_type; STAILQ_ENTRY(fuse_ticket) tk_ms_link; /* fields for handling answers coming from userspace */ struct fuse_iov tk_aw_fiov; void *tk_aw_bufdata; size_t tk_aw_bufsize; enum { FT_A_FIOV, FT_A_BUF } tk_aw_type; struct fuse_out_header tk_aw_ohead; int tk_aw_errno; struct mtx tk_aw_mtx; fuse_handler_t *tk_aw_handler; TAILQ_ENTRY(fuse_ticket) tk_aw_link; }; #define FT_ANSW 0x01 /* request of ticket has already been answered */ #define FT_DIRTY 0x04 /* ticket has been used */ static inline struct fuse_iov * fticket_resp(struct fuse_ticket *ftick) { return (&ftick->tk_aw_fiov); } static inline bool fticket_answered(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_aw_mtx, MA_OWNED); return (ftick->tk_flag & FT_ANSW); } static inline void fticket_set_answered(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_aw_mtx, MA_OWNED); ftick->tk_flag |= FT_ANSW; } +static inline struct fuse_in_header* +fticket_in_header(struct fuse_ticket *ftick) +{ + return (struct fuse_in_header *)(ftick->tk_ms_fiov.base); +} + static inline enum fuse_opcode fticket_opcode(struct fuse_ticket *ftick) { - return (((struct fuse_in_header *)(ftick->tk_ms_fiov.base))->opcode); + return fticket_in_header(ftick)->opcode; } int fticket_pull(struct fuse_ticket *ftick, struct uio *uio); enum mountpri { FM_NOMOUNTED, FM_PRIMARY, FM_SECONDARY }; /* * The data representing a FUSE session. */ struct fuse_data { struct cdev *fdev; struct mount *mp; struct vnode *vroot; struct ucred *daemoncred; int dataflags; int ref; struct mtx ms_mtx; STAILQ_HEAD(, fuse_ticket) ms_head; struct mtx aw_mtx; TAILQ_HEAD(, fuse_ticket) aw_head; + /* + * Holds the next value of the FUSE operation unique value. + * Also, serves as a wakeup channel to prevent any operations from + * being created before INIT completes. + */ u_long ticketer; struct sx rename_lock; uint32_t fuse_libabi_major; uint32_t fuse_libabi_minor; uint32_t max_write; uint32_t max_read; uint32_t subtype; char volname[MAXPATHLEN]; struct selinfo ks_rsel; int daemon_timeout; uint64_t notimpl; }; #define FSESS_DEAD 0x0001 /* session is to be closed */ #define FSESS_UNUSED0 0x0002 /* unused */ #define FSESS_INITED 0x0004 /* session has been inited */ #define FSESS_DAEMON_CAN_SPY 0x0010 /* let non-owners access this fs */ /* (and being observed by the daemon) */ #define FSESS_PUSH_SYMLINKS_IN 0x0020 /* prefix absolute symlinks with mp */ #define FSESS_DEFAULT_PERMISSIONS 0x0040 /* kernel does permission checking */ #define FSESS_NO_ATTRCACHE 0x0080 /* no attribute caching */ #define FSESS_NO_READAHEAD 0x0100 /* no readaheads */ #define FSESS_NO_DATACACHE 0x0200 /* disable buffer cache */ #define FSESS_NO_NAMECACHE 0x0400 /* disable name cache */ #define FSESS_NO_MMAP 0x0800 /* disable mmap */ #define FSESS_BROKENIO 0x1000 /* fix broken io */ #define FSESS_POSIX_LOCKS 0x2000 /* daemon supports POSIX locks */ enum fuse_data_cache_mode { FUSE_CACHE_UC, FUSE_CACHE_WT, FUSE_CACHE_WB, }; extern int fuse_data_cache_mode; extern int fuse_data_cache_invalidate; extern int fuse_mmap_enable; extern int fuse_sync_resize; extern int fuse_fix_broken_io; static inline struct fuse_data * fuse_get_mpdata(struct mount *mp) { return mp->mnt_data; } static inline bool fsess_isimpl(struct mount *mp, int opcode) { struct fuse_data *data = fuse_get_mpdata(mp); return ((data->notimpl & (1ULL << opcode)) == 0); } static inline void fsess_set_notimpl(struct mount *mp, int opcode) { struct fuse_data *data = fuse_get_mpdata(mp); data->notimpl |= (1ULL << opcode); } static inline bool fsess_opt_datacache(struct mount *mp) { struct fuse_data *data = fuse_get_mpdata(mp); return (fuse_data_cache_mode != FUSE_CACHE_UC && (data->dataflags & FSESS_NO_DATACACHE) == 0); } static inline bool fsess_opt_mmap(struct mount *mp) { struct fuse_data *data = fuse_get_mpdata(mp); if (!fuse_mmap_enable || fuse_data_cache_mode == FUSE_CACHE_UC) return (false); return ((data->dataflags & (FSESS_NO_DATACACHE | FSESS_NO_MMAP)) == 0); } static inline bool fsess_opt_brokenio(struct mount *mp) { struct fuse_data *data = fuse_get_mpdata(mp); return (fuse_fix_broken_io || (data->dataflags & FSESS_BROKENIO)); } static inline void fuse_ms_push(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_data->ms_mtx, MA_OWNED); refcount_acquire(&ftick->tk_refcount); STAILQ_INSERT_TAIL(&ftick->tk_data->ms_head, ftick, tk_ms_link); } static inline struct fuse_ticket * fuse_ms_pop(struct fuse_data *data) { struct fuse_ticket *ftick = NULL; mtx_assert(&data->ms_mtx, MA_OWNED); if ((ftick = STAILQ_FIRST(&data->ms_head))) { STAILQ_REMOVE_HEAD(&data->ms_head, tk_ms_link); #ifdef INVARIANTS ftick->tk_ms_link.stqe_next = NULL; #endif } return (ftick); } static inline void fuse_aw_push(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_data->aw_mtx, MA_OWNED); refcount_acquire(&ftick->tk_refcount); TAILQ_INSERT_TAIL(&ftick->tk_data->aw_head, ftick, tk_aw_link); } static inline void fuse_aw_remove(struct fuse_ticket *ftick) { mtx_assert(&ftick->tk_data->aw_mtx, MA_OWNED); TAILQ_REMOVE(&ftick->tk_data->aw_head, ftick, tk_aw_link); #ifdef INVARIANTS ftick->tk_aw_link.tqe_next = NULL; ftick->tk_aw_link.tqe_prev = NULL; #endif } static inline struct fuse_ticket * fuse_aw_pop(struct fuse_data *data) { struct fuse_ticket *ftick; mtx_assert(&data->aw_mtx, MA_OWNED); if ((ftick = TAILQ_FIRST(&data->aw_head)) != NULL) fuse_aw_remove(ftick); return (ftick); } struct fuse_ticket *fuse_ticket_fetch(struct fuse_data *data); int fuse_ticket_drop(struct fuse_ticket *ftick); void fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t *handler); void fuse_insert_message(struct fuse_ticket *ftick); static inline bool fuse_libabi_geq(struct fuse_data *data, uint32_t abi_maj, uint32_t abi_min) { return (data->fuse_libabi_major > abi_maj || (data->fuse_libabi_major == abi_maj && data->fuse_libabi_minor >= abi_min)); } struct fuse_data *fdata_alloc(struct cdev *dev, struct ucred *cred); void fdata_trydestroy(struct fuse_data *data); void fdata_set_dead(struct fuse_data *data); static inline bool fdata_get_dead(struct fuse_data *data) { return (data->dataflags & FSESS_DEAD); } struct fuse_dispatcher { struct fuse_ticket *tick; struct fuse_in_header *finh; void *indata; size_t iosize; uint64_t nodeid; int answ_stat; void *answ; }; static inline void fdisp_init(struct fuse_dispatcher *fdisp, size_t iosize) { fdisp->iosize = iosize; fdisp->tick = NULL; } static inline void fdisp_destroy(struct fuse_dispatcher *fdisp) { fuse_ticket_drop(fdisp->tick); #ifdef INVARIANTS fdisp->tick = NULL; #endif } void fdisp_refresh(struct fuse_dispatcher *fdip); void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred); void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred); void fdisp_refresh_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred); int fdisp_wait_answ(struct fuse_dispatcher *fdip); static inline int fdisp_simple_putget_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred) { fdisp_make_vp(fdip, op, vp, td, cred); return (fdisp_wait_answ(fdip)); } #endif /* _FUSE_IPC_H_ */ Index: projects/fuse2/tests/sys/fs/fusefs/interrupt.cc =================================================================== --- projects/fuse2/tests/sys/fs/fusefs/interrupt.cc (revision 346338) +++ projects/fuse2/tests/sys/fs/fusefs/interrupt.cc (revision 346339) @@ -1,312 +1,328 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 The FreeBSD Foundation * * This software was developed by BFF Storage Systems, LLC under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ extern "C" { #include #include #include } #include "mockfs.hh" #include "utils.hh" using namespace testing; /* Don't do anything; all we care about is that the syscall gets interrupted */ void sigusr2_handler(int __unused sig) { - if (verbosity > 1) - printf("Signaled!\n"); + if (verbosity > 1) { + printf("Signaled! thread %p\n", pthread_self()); + } + } void* killer(void* target) { /* * Sleep for awhile so we can be mostly confident that the main thread * is already blocked in write(2) */ usleep(250'000); if (verbosity > 1) - printf("Signalling!\n"); - pthread_kill(*(pthread_t*)target, SIGUSR2); + printf("Signalling! thread %p\n", target); + pthread_kill((pthread_t)target, SIGUSR2); return(NULL); } class Interrupt: public FuseTest { public: pthread_t m_child; Interrupt(): m_child(NULL) {}; void expect_lookup(const char *relpath, uint64_t ino) { FuseTest::expect_lookup(relpath, ino, S_IFREG | 0644, 0, 1); } /* * Expect a FUSE_WRITE but don't reply. Instead, just record the unique value * to the provided pointer */ void expect_write(uint64_t ino, uint64_t *write_unique) { EXPECT_CALL(*m_mock, process( ResultOf([=](auto in) { return (in->header.opcode == FUSE_WRITE && in->header.nodeid == ino); }, Eq(true)), _) ).WillOnce(Invoke([=](auto in, auto &out __unused) { *write_unique = in->header.unique; })); } void setup_interruptor(pthread_t self) { ASSERT_EQ(0, signal(SIGUSR2, sigusr2_handler)) << strerror(errno); ASSERT_EQ(0, pthread_create(&m_child, NULL, killer, (void*)self)) << strerror(errno); } void TearDown() { + struct sigaction sa; + if (m_child != NULL) { pthread_join(m_child, NULL); } + bzero(&sa, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigaction(SIGUSR2, &sa, NULL); FuseTest::TearDown(); } }; /* * An interrupt operation that gets received after the original command is * complete should generate an EAGAIN response. */ /* https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=236530 */ -TEST_F(Interrupt, DISABLED_already_complete) +TEST_F(Interrupt, already_complete) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); pthread_t self; uint64_t write_unique = 0; self = pthread_self(); expect_lookup(RELPATH, ino); expect_open(ino, 0, 1); - expect_getattr(ino, 0); expect_write(ino, &write_unique); EXPECT_CALL(*m_mock, process( ResultOf([&](auto in) { return (in->header.opcode == FUSE_INTERRUPT && in->body.interrupt.unique == write_unique); }, Eq(true)), _) ).WillOnce(Invoke([&](auto in, auto &out) { // First complete the write request auto out0 = new mockfs_buf_out; out0->header.unique = write_unique; SET_OUT_HEADER_LEN(out0, write); out0->body.write.size = bufsize; out.push_back(out0); // Then, respond EAGAIN to the interrupt request auto out1 = new mockfs_buf_out; out1->header.unique = in->header.unique; out1->header.error = -EAGAIN; out1->header.len = sizeof(out1->header); out.push_back(out1); })); fd = open(FULLPATH, O_WRONLY); ASSERT_LE(0, fd) << strerror(errno); setup_interruptor(self); - ASSERT_EQ(bufsize, write(fd, CONTENTS, bufsize)) << strerror(errno); + EXPECT_EQ(bufsize, write(fd, CONTENTS, bufsize)) << strerror(errno); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* * A FUSE filesystem is legally allowed to ignore INTERRUPT operations, and * complete the original operation whenever it damn well pleases. */ /* https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=236530 */ -TEST_F(Interrupt, DISABLED_ignore) +TEST_F(Interrupt, ignore) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); pthread_t self; uint64_t write_unique; self = pthread_self(); expect_lookup(RELPATH, ino); expect_open(ino, 0, 1); - expect_getattr(ino, 0); expect_write(ino, &write_unique); EXPECT_CALL(*m_mock, process( - ResultOf([=](auto in) { + ResultOf([&](auto in) { return (in->header.opcode == FUSE_INTERRUPT && in->body.interrupt.unique == write_unique); }, Eq(true)), _) ).WillOnce(Invoke([&](auto in __unused, auto &out) { // Ignore FUSE_INTERRUPT; respond to the FUSE_WRITE auto out0 = new mockfs_buf_out; out0->header.unique = write_unique; SET_OUT_HEADER_LEN(out0, write); out0->body.write.size = bufsize; out.push_back(out0); })); fd = open(FULLPATH, O_WRONLY); ASSERT_LE(0, fd) << strerror(errno); setup_interruptor(self); ASSERT_EQ(bufsize, write(fd, CONTENTS, bufsize)) << strerror(errno); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* * A syscall that gets interrupted while blocking on FUSE I/O should send a * FUSE_INTERRUPT command to the fuse filesystem, which should then send EINTR * in response to the _original_ operation. The kernel should ultimately * return EINTR to userspace */ /* https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=236530 */ -TEST_F(Interrupt, DISABLED_in_progress) +TEST_F(Interrupt, in_progress) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); pthread_t self; uint64_t write_unique; self = pthread_self(); expect_lookup(RELPATH, ino); expect_open(ino, 0, 1); - expect_getattr(ino, 0); expect_write(ino, &write_unique); EXPECT_CALL(*m_mock, process( - ResultOf([=](auto in) { + ResultOf([&](auto in) { return (in->header.opcode == FUSE_INTERRUPT && in->body.interrupt.unique == write_unique); }, Eq(true)), _) ).WillOnce(Invoke([&](auto in __unused, auto &out) { auto out0 = new mockfs_buf_out; out0->header.error = -EINTR; out0->header.unique = write_unique; out0->header.len = sizeof(out0->header); out.push_back(out0); })); fd = open(FULLPATH, O_WRONLY); ASSERT_LE(0, fd) << strerror(errno); setup_interruptor(self); ASSERT_EQ(-1, write(fd, CONTENTS, bufsize)); EXPECT_EQ(EINTR, errno); /* Deliberately leak fd. close(2) will be tested in release.cc */ } /* * If the FUSE filesystem receives the FUSE_INTERRUPT operation before * processing the original, then it should wait for "some timeout" for the * original operation to arrive. If not, it should send EAGAIN to the * INTERRUPT operation, and the kernel should requeue the INTERRUPT. * * In this test, we'll pretend that the INTERRUPT arrives too soon, gets * EAGAINed, then the kernel requeues it, and the second time around it * successfully interrupts the original */ /* https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=236530 */ -TEST_F(Interrupt, DISABLED_too_soon) +TEST_F(Interrupt, too_soon) { const char FULLPATH[] = "mountpoint/some_file.txt"; const char RELPATH[] = "some_file.txt"; const char *CONTENTS = "abcdefgh"; + Sequence seq; uint64_t ino = 42; int fd; ssize_t bufsize = strlen(CONTENTS); pthread_t self; uint64_t write_unique; self = pthread_self(); expect_lookup(RELPATH, ino); expect_open(ino, 0, 1); - expect_getattr(ino, 0); expect_write(ino, &write_unique); EXPECT_CALL(*m_mock, process( - ResultOf([=](auto in) { + ResultOf([&](auto in) { return (in->header.opcode == FUSE_INTERRUPT && in->body.interrupt.unique == write_unique); }, Eq(true)), _) - ).WillOnce(Invoke(ReturnErrno(EAGAIN))) - .RetiresOnSaturation(); + ).InSequence(seq) + .WillOnce(Invoke(ReturnErrno(EAGAIN))); EXPECT_CALL(*m_mock, process( - ResultOf([=](auto in) { + ResultOf([&](auto in) { return (in->header.opcode == FUSE_INTERRUPT && in->body.interrupt.unique == write_unique); }, Eq(true)), _) - ).WillOnce(Invoke([&](auto in __unused, auto &out __unused) { + ).InSequence(seq) + .WillOnce(Invoke([&](auto in __unused, auto &out __unused) { auto out0 = new mockfs_buf_out; out0->header.error = -EINTR; out0->header.unique = write_unique; out0->header.len = sizeof(out0->header); out.push_back(out0); })); fd = open(FULLPATH, O_WRONLY); ASSERT_LE(0, fd) << strerror(errno); setup_interruptor(self); ASSERT_EQ(-1, write(fd, CONTENTS, bufsize)); EXPECT_EQ(EINTR, errno); /* Deliberately leak fd. close(2) will be tested in release.cc */ } + + +// TODO: add a test that uses siginterrupt and an interruptible signal +// TODO: add a test that verifies a process can be cleanly killed even if a +// FUSE_WRITE command never returns. +// TODO: write in-progress tests for read and other operations +// TODO: add a test where write returns EWOULDBLOCK +// TODO: test that if a fatal signal is received, fticket_wait_answer will +// return without waiting for a response to the interrupted operation. +// TODO: test that operations that haven't been received by the server can be +// interrupted without generating a FUSE_INTERRUPT. Index: projects/fuse2/tests/sys/fs/fusefs/mockfs.cc =================================================================== --- projects/fuse2/tests/sys/fs/fusefs/mockfs.cc (revision 346338) +++ projects/fuse2/tests/sys/fs/fusefs/mockfs.cc (revision 346339) @@ -1,485 +1,488 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 The FreeBSD Foundation * * This software was developed by BFF Storage Systems, LLC under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ extern "C" { #include #include #include #include #include #include #include #include #include #include #include #include "mntopts.h" // for build_iovec } #include #include "mockfs.hh" using namespace testing; int verbosity = 0; static sig_atomic_t quit = 0; const char* opcode2opname(uint32_t opcode) { const int NUM_OPS = 39; const char* table[NUM_OPS] = { "Unknown (opcode 0)", "LOOKUP", "FORGET", "GETATTR", "SETATTR", "READLINK", "SYMLINK", "Unknown (opcode 7)", "MKNOD", "MKDIR", "UNLINK", "RMDIR", "RENAME", "LINK", "OPEN", "READ", "WRITE", "STATFS", "RELEASE", "Unknown (opcode 19)", "FSYNC", "SETXATTR", "GETXATTR", "LISTXATTR", "REMOVEXATTR", "FLUSH", "INIT", "OPENDIR", "READDIR", "RELEASEDIR", "FSYNCDIR", "GETLK", "SETLK", "SETLKW", "ACCESS", "CREATE", "INTERRUPT", "BMAP", "DESTROY" }; if (opcode >= NUM_OPS) return ("Unknown (opcode > max)"); else return (table[opcode]); } ProcessMockerT ReturnErrno(int error) { return([=](auto in, auto &out) { auto out0 = new mockfs_buf_out; out0->header.unique = in->header.unique; out0->header.error = -error; out0->header.len = sizeof(out0->header); out.push_back(out0); }); } /* Helper function used for returning negative cache entries for LOOKUP */ ProcessMockerT ReturnNegativeCache(const struct timespec *entry_valid) { return([=](auto in, auto &out) { /* nodeid means ENOENT and cache it */ auto out0 = new mockfs_buf_out; out0->body.entry.nodeid = 0; out0->header.unique = in->header.unique; out0->header.error = 0; out0->body.entry.entry_valid = entry_valid->tv_sec; out0->body.entry.entry_valid_nsec = entry_valid->tv_nsec; SET_OUT_HEADER_LEN(out0, entry); out.push_back(out0); }); } ProcessMockerT ReturnImmediate(std::function f) { return([=](auto in, auto &out) { auto out0 = new mockfs_buf_out; out0->header.unique = in->header.unique; f(in, out0); out.push_back(out0); }); } void sigint_handler(int __unused sig) { quit = 1; } void debug_fuseop(const mockfs_buf_in *in) { printf("%-11s ino=%2lu", opcode2opname(in->header.opcode), in->header.nodeid); if (verbosity > 1) { printf(" uid=%5u gid=%5u pid=%5u unique=%lu len=%u", in->header.uid, in->header.gid, in->header.pid, in->header.unique, in->header.len); } switch (in->header.opcode) { const char *name, *value; case FUSE_ACCESS: printf(" mask=%#x", in->body.access.mask); break; case FUSE_CREATE: name = (const char*)in->body.bytes + sizeof(fuse_open_in); printf(" flags=%#x name=%s", in->body.open.flags, name); break; case FUSE_FLUSH: printf(" fh=%#lx lock_owner=%lu", in->body.flush.fh, in->body.flush.lock_owner); break; case FUSE_FORGET: printf(" nlookup=%lu", in->body.forget.nlookup); break; case FUSE_FSYNC: printf(" flags=%#x", in->body.fsync.fsync_flags); break; case FUSE_FSYNCDIR: printf(" flags=%#x", in->body.fsyncdir.fsync_flags); break; + case FUSE_INTERRUPT: + printf(" unique=%lu", in->body.interrupt.unique); + break; case FUSE_LOOKUP: printf(" %s", in->body.lookup); break; case FUSE_MKNOD: printf(" mode=%#o rdev=%x", in->body.mknod.mode, in->body.mknod.rdev); break; case FUSE_OPEN: printf(" flags=%#x mode=%#o", in->body.open.flags, in->body.open.mode); break; case FUSE_OPENDIR: printf(" flags=%#x mode=%#o", in->body.opendir.flags, in->body.opendir.mode); break; case FUSE_READ: printf(" offset=%lu size=%u", in->body.read.offset, in->body.read.size); break; case FUSE_READDIR: printf(" fh=%#lx offset=%lu size=%u", in->body.readdir.fh, in->body.readdir.offset, in->body.readdir.size); break; case FUSE_RELEASE: printf(" fh=%#lx flags=%#x lock_owner=%lu", in->body.release.fh, in->body.release.flags, in->body.release.lock_owner); break; case FUSE_SETATTR: if (verbosity <= 1) { printf(" valid=%#x", in->body.setattr.valid); break; } if (in->body.setattr.valid & FATTR_MODE) printf(" mode=%#o", in->body.setattr.mode); if (in->body.setattr.valid & FATTR_UID) printf(" uid=%u", in->body.setattr.uid); if (in->body.setattr.valid & FATTR_GID) printf(" gid=%u", in->body.setattr.gid); if (in->body.setattr.valid & FATTR_SIZE) printf(" size=%zu", in->body.setattr.size); if (in->body.setattr.valid & FATTR_ATIME) printf(" atime=%zu.%u", in->body.setattr.atime, in->body.setattr.atimensec); if (in->body.setattr.valid & FATTR_MTIME) printf(" mtime=%zu.%u", in->body.setattr.mtime, in->body.setattr.mtimensec); if (in->body.setattr.valid & FATTR_FH) printf(" fh=%zu", in->body.setattr.fh); break; case FUSE_SETLK: printf(" fh=%#lx owner=%lu type=%u pid=%u", in->body.setlk.fh, in->body.setlk.owner, in->body.setlk.lk.type, in->body.setlk.lk.pid); if (verbosity >= 2) { printf(" range=[%lu-%lu]", in->body.setlk.lk.start, in->body.setlk.lk.end); } break; case FUSE_SETXATTR: /* * In theory neither the xattr name and value need be * ASCII, but in this test suite they always are. */ name = (const char*)in->body.bytes + sizeof(fuse_setxattr_in); value = name + strlen(name) + 1; printf(" %s=%s", name, value); break; case FUSE_WRITE: printf(" fh=%#lx offset=%lu size=%u flags=%u", in->body.write.fh, in->body.write.offset, in->body.write.size, in->body.write.write_flags); break; default: break; } printf("\n"); } MockFS::MockFS(int max_readahead, bool allow_other, bool default_permissions, bool push_symlinks_in, bool ro, uint32_t flags) { struct iovec *iov = NULL; int iovlen = 0; char fdstr[15]; const bool trueval = true; m_daemon_id = NULL; m_maxreadahead = max_readahead; quit = 0; /* * Kyua sets pwd to a testcase-unique tempdir; no need to use * mkdtemp */ /* * googletest doesn't allow ASSERT_ in constructors, so we must throw * instead. */ if (mkdir("mountpoint" , 0755) && errno != EEXIST) throw(std::system_error(errno, std::system_category(), "Couldn't make mountpoint directory")); m_fuse_fd = open("/dev/fuse", O_CLOEXEC | O_RDWR); if (m_fuse_fd < 0) throw(std::system_error(errno, std::system_category(), "Couldn't open /dev/fuse")); sprintf(fdstr, "%d", m_fuse_fd); m_pid = getpid(); m_child_pid = -1; build_iovec(&iov, &iovlen, "fstype", __DECONST(void *, "fusefs"), -1); build_iovec(&iov, &iovlen, "fspath", __DECONST(void *, "mountpoint"), -1); build_iovec(&iov, &iovlen, "from", __DECONST(void *, "/dev/fuse"), -1); build_iovec(&iov, &iovlen, "fd", fdstr, -1); if (allow_other) { build_iovec(&iov, &iovlen, "allow_other", __DECONST(void*, &trueval), sizeof(bool)); } if (default_permissions) { build_iovec(&iov, &iovlen, "default_permissions", __DECONST(void*, &trueval), sizeof(bool)); } if (push_symlinks_in) { build_iovec(&iov, &iovlen, "push_symlinks_in", __DECONST(void*, &trueval), sizeof(bool)); } if (ro) { build_iovec(&iov, &iovlen, "ro", __DECONST(void*, &trueval), sizeof(bool)); } if (nmount(iov, iovlen, 0)) throw(std::system_error(errno, std::system_category(), "Couldn't mount filesystem")); // Setup default handler ON_CALL(*this, process(_, _)) .WillByDefault(Invoke(this, &MockFS::process_default)); init(flags); signal(SIGUSR1, sigint_handler); if (pthread_create(&m_daemon_id, NULL, service, (void*)this)) throw(std::system_error(errno, std::system_category(), "Couldn't Couldn't start fuse thread")); } MockFS::~MockFS() { kill_daemon(); ::unmount("mountpoint", MNT_FORCE); if (m_daemon_id != NULL) { pthread_join(m_daemon_id, NULL); m_daemon_id = NULL; } rmdir("mountpoint"); } void MockFS::init(uint32_t flags) { mockfs_buf_in *in; mockfs_buf_out *out; in = (mockfs_buf_in*) malloc(sizeof(*in)); ASSERT_TRUE(in != NULL); out = (mockfs_buf_out*) malloc(sizeof(*out)); ASSERT_TRUE(out != NULL); read_request(in); ASSERT_EQ(FUSE_INIT, in->header.opcode); memset(out, 0, sizeof(*out)); out->header.unique = in->header.unique; out->header.error = 0; out->body.init.major = FUSE_KERNEL_VERSION; out->body.init.minor = FUSE_KERNEL_MINOR_VERSION; out->body.init.flags = in->body.init.flags & flags; /* * The default max_write is set to this formula in libfuse, though * individual filesystems can lower it. The "- 4096" was added in * commit 154ffe2, with the commit message "fix". */ uint32_t default_max_write = 32 * getpagesize() + 0x1000 - 4096; /* For testing purposes, it should be distinct from MAXPHYS */ m_max_write = MIN(default_max_write, MAXPHYS / 2); out->body.init.max_write = m_max_write; out->body.init.max_readahead = m_maxreadahead; SET_OUT_HEADER_LEN(out, init); write(m_fuse_fd, out, out->header.len); free(in); } void MockFS::kill_daemon() { if (m_daemon_id != NULL) { pthread_kill(m_daemon_id, SIGUSR1); // Closing the /dev/fuse file descriptor first allows unmount // to succeed even if the daemon doesn't correctly respond to // commands during the unmount sequence. close(m_fuse_fd); } } void MockFS::loop() { mockfs_buf_in *in; std::vector out; in = (mockfs_buf_in*) malloc(sizeof(*in)); ASSERT_TRUE(in != NULL); while (!quit) { bzero(in, sizeof(*in)); read_request(in); if (quit) break; if (verbosity > 0) debug_fuseop(in); if (pid_ok((pid_t)in->header.pid)) { process(in, out); } else { /* * Reject any requests from unknown processes. Because * we actually do mount a filesystem, plenty of * unrelated system daemons may try to access it. */ process_default(in, out); } for (auto &it: out) { ASSERT_TRUE(write(m_fuse_fd, it, it->header.len) > 0 || errno == EAGAIN) << strerror(errno); delete it; } out.clear(); } free(in); } bool MockFS::pid_ok(pid_t pid) { if (pid == m_pid) { return (true); } else if (pid == m_child_pid) { return (true); } else { struct kinfo_proc *ki; bool ok = false; ki = kinfo_getproc(pid); if (ki == NULL) return (false); /* * Allow access by the aio daemon processes so that our tests * can use aio functions */ if (0 == strncmp("aiod", ki->ki_comm, 4)) ok = true; free(ki); return (ok); } } void MockFS::process_default(const mockfs_buf_in *in, std::vector &out) { auto out0 = new mockfs_buf_out; out0->header.unique = in->header.unique; out0->header.error = -EOPNOTSUPP; out0->header.len = sizeof(out0->header); out.push_back(out0); } void MockFS::read_request(mockfs_buf_in *in) { ssize_t res; res = read(m_fuse_fd, in, sizeof(*in)); if (res < 0 && !quit) perror("read"); ASSERT_TRUE(res >= (ssize_t)sizeof(in->header) || quit); } void* MockFS::service(void *pthr_data) { MockFS *mock_fs = (MockFS*)pthr_data; mock_fs->loop(); return (NULL); } void MockFS::unmount() { ::unmount("mountpoint", 0); }