Index: projects/pnfs-server/sys/fs/fuse/fuse_file.c =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_file.c (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_file.c (revision 297887) @@ -1,259 +1,287 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #define FUSE_DEBUG_MODULE FILE #include "fuse_debug.h" static int fuse_fh_count = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, filehandle_count, CTLFLAG_RD, &fuse_fh_count, 0, ""); +extern int fuse_force_directio; + int fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type, struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred) { struct fuse_dispatcher fdi; struct fuse_open_in *foi; struct fuse_open_out *foo; int err = 0; int isdir = 0; int oflags = 0; int op = FUSE_OPEN; fuse_trace_printf("fuse_filehandle_open(vp=%p, fufh_type=%d)\n", vp, fufh_type); if (fuse_filehandle_valid(vp, fufh_type)) { panic("FUSE: filehandle_open called despite valid fufh (type=%d)", fufh_type); /* NOTREACHED */ } /* * Note that this means we are effectively FILTERING OUT open() flags. */ oflags = fuse_filehandle_xlate_to_oflags(fufh_type); if (vnode_isdir(vp)) { isdir = 1; op = FUSE_OPENDIR; if (fufh_type != FUFH_RDONLY) { printf("FUSE:non-rdonly fh requested for a directory?\n"); fufh_type = FUFH_RDONLY; } } fdisp_init(&fdi, sizeof(*foi)); fdisp_make_vp(&fdi, op, vp, td, cred); foi = fdi.indata; foi->flags = oflags; if ((err = fdisp_wait_answ(&fdi))) { debug_printf("OUCH ... daemon didn't give fh (err = %d)\n", err); if (err == ENOENT) { fuse_internal_vnode_disappear(vp); } goto out; } foo = fdi.answ; fuse_filehandle_init(vp, fufh_type, fufhp, foo->fh); - fuse_vnode_open(vp, foo->open_flags, td); + if (fufh_type == FUFH_WRONLY || fuse_force_directio > 0) + fuse_vnode_open(vp, foo->open_flags | FOPEN_DIRECT_IO, td); + else + fuse_vnode_open(vp, foo->open_flags, td); + out: fdisp_destroy(&fdi); return err; } int fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type, struct thread *td, struct ucred *cred) { struct fuse_dispatcher fdi; struct fuse_release_in *fri; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh = NULL; int err = 0; int isdir = 0; int op = FUSE_RELEASE; fuse_trace_printf("fuse_filehandle_put(vp=%p, fufh_type=%d)\n", vp, fufh_type); fufh = &(fvdat->fufh[fufh_type]); if (!FUFH_IS_VALID(fufh)) { panic("FUSE: filehandle_put called on invalid fufh (type=%d)", fufh_type); /* NOTREACHED */ } if (fuse_isdeadfs(vp)) { goto out; } if (vnode_isdir(vp)) { op = FUSE_RELEASEDIR; isdir = 1; } fdisp_init(&fdi, sizeof(*fri)); fdisp_make_vp(&fdi, op, vp, td, cred); fri = fdi.indata; fri->fh = fufh->fh_id; fri->flags = fuse_filehandle_xlate_to_oflags(fufh_type); err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); out: atomic_subtract_acq_int(&fuse_fh_count, 1); fufh->fh_id = (uint64_t)-1; fufh->fh_type = FUFH_INVALID; return err; } int fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; fufh = &(fvdat->fufh[fufh_type]); return FUFH_IS_VALID(fufh); +} + +/* + * Check for a valid file handle, first the type requested, but if that + * isn't valid, try for FUFH_RDWR. + * Return the FUFH type that is valid or FUFH_INVALID if there are none. + * This is a variant of fuse_filehandle_vaild() analygous to + * fuse_filehandle_getrw(). + */ +fufh_type_t +fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type) +{ + struct fuse_vnode_data *fvdat = VTOFUD(vp); + struct fuse_filehandle *fufh; + + fufh = &fvdat->fufh[fufh_type]; + if (FUFH_IS_VALID(fufh) != 0) + return (fufh_type); + fufh = &fvdat->fufh[FUFH_RDWR]; + if (FUFH_IS_VALID(fufh) != 0) + return (FUFH_RDWR); + return (FUFH_INVALID); } int fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type, struct fuse_filehandle **fufhp) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; fufh = &(fvdat->fufh[fufh_type]); if (!FUFH_IS_VALID(fufh)) return EBADF; if (fufhp != NULL) *fufhp = fufh; return 0; } int fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type, struct fuse_filehandle **fufhp) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; fufh = &(fvdat->fufh[fufh_type]); if (!FUFH_IS_VALID(fufh)) { fufh_type = FUFH_RDWR; } return fuse_filehandle_get(vp, fufh_type, fufhp); } void fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type, struct fuse_filehandle **fufhp, uint64_t fh_id) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; FS_DEBUG("id=%jd type=%d\n", (intmax_t)fh_id, fufh_type); fufh = &(fvdat->fufh[fufh_type]); MPASS(!FUFH_IS_VALID(fufh)); fufh->fh_id = fh_id; fufh->fh_type = fufh_type; if (!FUFH_IS_VALID(fufh)) { panic("FUSE: init: invalid filehandle id (type=%d)", fufh_type); } if (fufhp != NULL) *fufhp = fufh; atomic_add_acq_int(&fuse_fh_count, 1); } Index: projects/pnfs-server/sys/fs/fuse/fuse_file.h =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_file.h (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_file.h (revision 297887) @@ -1,153 +1,154 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _FUSE_FILE_H_ #define _FUSE_FILE_H_ #include #include #include #include #include typedef enum fufh_type { FUFH_INVALID = -1, FUFH_RDONLY = 0, FUFH_WRONLY = 1, FUFH_RDWR = 2, FUFH_MAXTYPE = 3, } fufh_type_t; struct fuse_filehandle { uint64_t fh_id; fufh_type_t fh_type; }; #define FUFH_IS_VALID(f) ((f)->fh_type != FUFH_INVALID) static __inline__ fufh_type_t fuse_filehandle_xlate_from_mmap(int fflags) { if (fflags & (PROT_READ | PROT_WRITE)) { return FUFH_RDWR; } else if (fflags & (PROT_WRITE)) { return FUFH_WRONLY; } else if ((fflags & PROT_READ) || (fflags & PROT_EXEC)) { return FUFH_RDONLY; } else { return FUFH_INVALID; } } static __inline__ fufh_type_t fuse_filehandle_xlate_from_fflags(int fflags) { if ((fflags & FREAD) && (fflags & FWRITE)) { return FUFH_RDWR; } else if (fflags & (FWRITE)) { return FUFH_WRONLY; } else if (fflags & (FREAD)) { return FUFH_RDONLY; } else { panic("FUSE: What kind of a flag is this (%x)?", fflags); } } static __inline__ int fuse_filehandle_xlate_to_oflags(fufh_type_t type) { int oflags = -1; switch (type) { case FUFH_RDONLY: oflags = O_RDONLY; break; case FUFH_WRONLY: oflags = O_WRONLY; break; case FUFH_RDWR: oflags = O_RDWR; break; default: break; } return oflags; } int fuse_filehandle_valid(struct vnode *vp, fufh_type_t fufh_type); +fufh_type_t fuse_filehandle_validrw(struct vnode *vp, fufh_type_t fufh_type); int fuse_filehandle_get(struct vnode *vp, fufh_type_t fufh_type, struct fuse_filehandle **fufhp); int fuse_filehandle_getrw(struct vnode *vp, fufh_type_t fufh_type, struct fuse_filehandle **fufhp); void fuse_filehandle_init(struct vnode *vp, fufh_type_t fufh_type, struct fuse_filehandle **fufhp, uint64_t fh_id); int fuse_filehandle_open(struct vnode *vp, fufh_type_t fufh_type, struct fuse_filehandle **fufhp, struct thread *td, struct ucred *cred); int fuse_filehandle_close(struct vnode *vp, fufh_type_t fufh_type, struct thread *td, struct ucred *cred); #endif /* _FUSE_FILE_H_ */ Index: projects/pnfs-server/sys/fs/fuse/fuse_internal.c =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_internal.c (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_internal.c (revision 297887) @@ -1,649 +1,696 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_file.h" #include "fuse_param.h" #define FUSE_DEBUG_MODULE INTERNAL #include "fuse_debug.h" #ifdef ZERO_PAD_INCOMPLETE_BUFS static int isbzero(void *buf, size_t len); #endif /* access */ int fuse_internal_access(struct vnode *vp, mode_t mode, struct fuse_access_param *facp, struct thread *td, struct ucred *cred) { int err = 0; uint32_t mask = 0; int dataflags; int vtype; struct mount *mp; struct fuse_dispatcher fdi; struct fuse_access_in *fai; struct fuse_data *data; /* NOT YET DONE */ /* * If this vnop gives you trouble, just return 0 here for a lazy * kludge. */ /* return 0;*/ fuse_trace_printf_func(); mp = vnode_mount(vp); vtype = vnode_vtype(vp); data = fuse_get_mpdata(mp); dataflags = data->dataflags; if ((mode & VWRITE) && vfs_isrdonly(mp)) { return EACCES; } /* Unless explicitly permitted, deny everyone except the fs owner. */ if (vnode_isvroot(vp) && !(facp->facc_flags & FACCESS_NOCHECKSPY)) { if (!(dataflags & FSESS_DAEMON_CAN_SPY)) { int denied = fuse_match_cred(data->daemoncred, cred); if (denied) { return EPERM; } } facp->facc_flags |= FACCESS_NOCHECKSPY; } if (!(facp->facc_flags & FACCESS_DO_ACCESS)) { return 0; } if (((vtype == VREG) && (mode & VEXEC))) { #ifdef NEED_MOUNT_ARGUMENT_FOR_THIS /* Let the kernel handle this through open / close heuristics.*/ return ENOTSUP; #else /* Let the kernel handle this. */ return 0; #endif } if (!fsess_isimpl(mp, FUSE_ACCESS)) { /* Let the kernel handle this. */ return 0; } if (dataflags & FSESS_DEFAULT_PERMISSIONS) { /* Let the kernel handle this. */ return 0; } if ((mode & VADMIN) != 0) { err = priv_check_cred(cred, PRIV_VFS_ADMIN, 0); if (err) { return err; } } if ((mode & (VWRITE | VAPPEND | VADMIN)) != 0) { mask |= W_OK; } if ((mode & VREAD) != 0) { mask |= R_OK; } if ((mode & VEXEC) != 0) { mask |= X_OK; } bzero(&fdi, sizeof(fdi)); fdisp_init(&fdi, sizeof(*fai)); fdisp_make_vp(&fdi, FUSE_ACCESS, vp, td, cred); fai = fdi.indata; fai->mask = F_OK; fai->mask |= mask; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_ACCESS); err = 0; } return err; } /* fsync */ int fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio) { fuse_trace_printf_func(); if (tick->tk_aw_ohead.error == ENOSYS) { fsess_set_notimpl(tick->tk_data->mp, fticket_opcode(tick)); } return 0; } int fuse_internal_fsync(struct vnode *vp, struct thread *td, struct ucred *cred, struct fuse_filehandle *fufh) { int op = FUSE_FSYNC; struct fuse_fsync_in *ffsi; struct fuse_dispatcher fdi; fuse_trace_printf_func(); if (vnode_isdir(vp)) { op = FUSE_FSYNCDIR; } fdisp_init(&fdi, sizeof(*ffsi)); fdisp_make_vp(&fdi, op, vp, td, cred); ffsi = fdi.indata; ffsi->fh = fufh->fh_id; ffsi->fsync_flags = 1; /* datasync */ fuse_insert_callback(fdi.tick, fuse_internal_fsync_callback); fuse_insert_message(fdi.tick); fdisp_destroy(&fdi); return 0; } /* readdir */ int fuse_internal_readdir(struct vnode *vp, struct uio *uio, + off_t startoff, struct fuse_filehandle *fufh, - struct fuse_iov *cookediov) + struct fuse_iov *cookediov, + int *ncookies, + u_long *cookies) { int err = 0; struct fuse_dispatcher fdi; struct fuse_read_in *fri; + int fnd_start; - if (uio_resid(uio) == 0) { - return 0; - } + if (uio_resid(uio) == 0) + return (0); fdisp_init(&fdi, 0); /* * Note that we DO NOT have a UIO_SYSSPACE here (so no need for p2p * I/O). */ + /* + * fnd_start is set non-zero once the offset in the directory gets + * to the startoff. This is done because directories must be read + * from the beginning (offset == 0) when fuse_vnop_readdir() needs + * to do an open of the directory. + * If it is not set non-zero here, it will be set non-zero in + * fuse_internal_readdir_processdata() when uio_offset == startoff. + */ + fnd_start = 0; + if (uio->uio_offset == startoff) + fnd_start = 1; while (uio_resid(uio) > 0) { - fdi.iosize = sizeof(*fri); fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL); fri = fdi.indata; fri->fh = fufh->fh_id; fri->offset = uio_offset(uio); fri->size = min(uio_resid(uio), FUSE_DEFAULT_IOSIZE); - /* mp->max_read */ - if ((err = fdisp_wait_answ(&fdi))) { + if ((err = fdisp_wait_answ(&fdi)) != 0) break; - } - if ((err = fuse_internal_readdir_processdata(uio, fri->size, fdi.answ, - fdi.iosize, cookediov))) { + if ((err = fuse_internal_readdir_processdata(uio, startoff, + &fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov, + ncookies, &cookies)) != 0) break; - } } fdisp_destroy(&fdi); return ((err == -1) ? 0 : err); } +/* + * Return -1 to indicate that this readdir is finished, 0 if it copied + * all the directory data read in and it may be possible to read more + * and greater than 0 for a failure. + */ int fuse_internal_readdir_processdata(struct uio *uio, + off_t startoff, + int *fnd_start, size_t reqsize, void *buf, size_t bufsize, - void *param) + struct fuse_iov *cookediov, + int *ncookies, + u_long **cookiesp) { int err = 0; - int cou = 0; int bytesavail; size_t freclen; - struct dirent *de; struct fuse_dirent *fudge; - struct fuse_iov *cookediov = param; + u_long *cookies; + cookies = *cookiesp; if (bufsize < FUSE_NAME_OFFSET) { - return -1; + return (-1); } for (;;) { if (bufsize < FUSE_NAME_OFFSET) { - err = -1; + /* + * Return 0 since we have done at least one loop + * iteration and we are at the end of the buffer. + * As such, we might be able to read more directory + * data. + */ + err = 0; break; } fudge = (struct fuse_dirent *)buf; freclen = FUSE_DIRENT_SIZE(fudge); - cou++; - if (bufsize < freclen) { - err = ((cou == 1) ? -1 : 0); + /* + * This indicates a partial directory entry at the + * end of the directory data. + */ + err = -1; break; } #ifdef ZERO_PAD_INCOMPLETE_BUFS if (isbzero(buf, FUSE_NAME_OFFSET)) { err = -1; break; } #endif - if (!fudge->namelen || fudge->namelen > MAXNAMLEN) { + if (fudge->namelen == 0 || fudge->namelen > MAXNAMLEN) { err = EINVAL; break; } - bytesavail = GENERIC_DIRSIZ((struct pseudo_dirent *) - &fudge->namelen); + /* bytesavail is the size of the BSD dirent. */ + bytesavail = GENERIC_DIRSIZ((struct pseudo_dirent *) + &fudge->namelen); if (bytesavail > uio_resid(uio)) { + /* Out of space for the dir so we are done. */ err = -1; break; } - fiov_refresh(cookediov); - fiov_adjust(cookediov, bytesavail); - de = (struct dirent *)cookediov->base; - de->d_fileno = fudge->ino; /* XXX: truncation */ - de->d_reclen = bytesavail; - de->d_type = fudge->type; - de->d_namlen = fudge->namelen; - memcpy((char *)cookediov->base + sizeof(struct dirent) - - MAXNAMLEN - 1, - (char *)buf + FUSE_NAME_OFFSET, fudge->namelen); - ((char *)cookediov->base)[bytesavail] = '\0'; + /* + * Don't start to copy the directory entries out until + * the requested offset in the directory is found. + */ + if (*fnd_start != 0) { + fiov_refresh(cookediov); + fiov_adjust(cookediov, bytesavail); + de = (struct dirent *)cookediov->base; + de->d_fileno = fudge->ino; /* XXX: truncation */ + de->d_reclen = bytesavail; + de->d_type = fudge->type; + de->d_namlen = fudge->namelen; + memcpy((char *)cookediov->base + + sizeof(struct dirent) - MAXNAMLEN - 1, + (char *)buf + FUSE_NAME_OFFSET, fudge->namelen); + ((char *)cookediov->base)[bytesavail] = '\0'; + + err = uiomove(cookediov->base, cookediov->len, uio); + if (err != 0) + break; + if (cookies != NULL) { + if (*ncookies == 0) { + err = -1; + break; + } + *cookies = fudge->off; + cookies++; + (*ncookies)--; + } + } else if (startoff == fudge->off) + *fnd_start = 1; - err = uiomove(cookediov->base, cookediov->len, uio); - if (err) { - break; - } + /* Move to the next fuse directory entry. */ + uio_setoffset(uio, fudge->off); buf = (char *)buf + freclen; bufsize -= freclen; - uio_setoffset(uio, fudge->off); } + *cookiesp = cookies; - return err; + return (err); } /* remove */ #define INVALIDATE_CACHED_VATTRS_UPON_UNLINK 1 int fuse_internal_remove(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, enum fuse_opcode op) { struct fuse_dispatcher fdi; struct vattr *vap = VTOVA(vp); #if INVALIDATE_CACHED_VATTRS_UPON_UNLINK int need_invalidate = 0; uint64_t target_nlink = 0; #endif int err = 0; debug_printf("dvp=%p, cnp=%p, op=%d\n", vp, cnp, op); fdisp_init(&fdi, cnp->cn_namelen + 1); fdisp_make_vp(&fdi, op, dvp, cnp->cn_thread, cnp->cn_cred); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; #if INVALIDATE_CACHED_VATTRS_UPON_UNLINK if (vap->va_nlink > 1) { need_invalidate = 1; target_nlink = vap->va_nlink; } #endif err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); return err; } /* rename */ int fuse_internal_rename(struct vnode *fdvp, struct componentname *fcnp, struct vnode *tdvp, struct componentname *tcnp) { struct fuse_dispatcher fdi; struct fuse_rename_in *fri; int err = 0; fdisp_init(&fdi, sizeof(*fri) + fcnp->cn_namelen + tcnp->cn_namelen + 2); fdisp_make_vp(&fdi, FUSE_RENAME, fdvp, tcnp->cn_thread, tcnp->cn_cred); fri = fdi.indata; fri->newdir = VTOI(tdvp); memcpy((char *)fdi.indata + sizeof(*fri), fcnp->cn_nameptr, fcnp->cn_namelen); ((char *)fdi.indata)[sizeof(*fri) + fcnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + sizeof(*fri) + fcnp->cn_namelen + 1, tcnp->cn_nameptr, tcnp->cn_namelen); ((char *)fdi.indata)[sizeof(*fri) + fcnp->cn_namelen + tcnp->cn_namelen + 1] = '\0'; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); return err; } /* strategy */ /* entity creation */ void fuse_internal_newentry_makerequest(struct mount *mp, uint64_t dnid, struct componentname *cnp, enum fuse_opcode op, void *buf, size_t bufsize, struct fuse_dispatcher *fdip) { debug_printf("fdip=%p\n", fdip); fdip->iosize = bufsize + cnp->cn_namelen + 1; fdisp_make(fdip, op, mp, dnid, cnp->cn_thread, cnp->cn_cred); memcpy(fdip->indata, buf, bufsize); memcpy((char *)fdip->indata + bufsize, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[bufsize + cnp->cn_namelen] = '\0'; } int fuse_internal_newentry_core(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum vtype vtyp, struct fuse_dispatcher *fdip) { int err = 0; struct fuse_entry_out *feo; struct mount *mp = vnode_mount(dvp); if ((err = fdisp_wait_answ(fdip))) { return err; } feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, vtyp))) { return err; } err = fuse_vnode_get(mp, feo->nodeid, dvp, vpp, cnp, vtyp); if (err) { fuse_internal_forget_send(mp, cnp->cn_thread, cnp->cn_cred, feo->nodeid, 1); return err; } cache_attrs(*vpp, feo); return err; } int fuse_internal_newentry(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum fuse_opcode op, void *buf, size_t bufsize, enum vtype vtype) { int err; struct fuse_dispatcher fdi; struct mount *mp = vnode_mount(dvp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(mp, VTOI(dvp), cnp, op, buf, bufsize, &fdi); err = fuse_internal_newentry_core(dvp, vpp, cnp, vtype, &fdi); fdisp_destroy(&fdi); return err; } /* entity destruction */ int fuse_internal_forget_callback(struct fuse_ticket *ftick, struct uio *uio) { fuse_internal_forget_send(ftick->tk_data->mp, curthread, NULL, ((struct fuse_in_header *)ftick->tk_ms_fiov.base)->nodeid, 1); return 0; } void fuse_internal_forget_send(struct mount *mp, struct thread *td, struct ucred *cred, uint64_t nodeid, uint64_t nlookup) { struct fuse_dispatcher fdi; struct fuse_forget_in *ffi; debug_printf("mp=%p, nodeid=%ju, nlookup=%ju\n", mp, (uintmax_t)nodeid, (uintmax_t)nlookup); /* * KASSERT(nlookup > 0, ("zero-times forget for vp #%llu", * (long long unsigned) nodeid)); */ fdisp_init(&fdi, sizeof(*ffi)); fdisp_make(&fdi, FUSE_FORGET, mp, nodeid, td, cred); ffi = fdi.indata; ffi->nlookup = nlookup; fuse_insert_message(fdi.tick); fdisp_destroy(&fdi); } void fuse_internal_vnode_disappear(struct vnode *vp) { struct fuse_vnode_data *fvdat = VTOFUD(vp); ASSERT_VOP_ELOCKED(vp, "fuse_internal_vnode_disappear"); fvdat->flag |= FN_REVOKED; cache_purge(vp); } /* fuse start/stop */ int fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio) { int err = 0; struct fuse_data *data = tick->tk_data; struct fuse_init_out *fiio; if ((err = tick->tk_aw_ohead.error)) { goto out; } if ((err = fticket_pull(tick, uio))) { goto out; } fiio = fticket_resp(tick)->base; /* XXX: Do we want to check anything further besides this? */ if (fiio->major < 7) { debug_printf("userpace version too low\n"); err = EPROTONOSUPPORT; goto out; } data->fuse_libabi_major = fiio->major; data->fuse_libabi_minor = fiio->minor; if (fuse_libabi_geq(data, 7, 5)) { if (fticket_resp(tick)->len == sizeof(struct fuse_init_out)) { data->max_write = fiio->max_write; } else { err = EINVAL; } } else { /* Old fix values */ data->max_write = 4096; } out: if (err) { fdata_set_dead(data); } FUSE_LOCK(); data->dataflags |= FSESS_INITED; wakeup(&data->ticketer); FUSE_UNLOCK(); return 0; } void fuse_internal_send_init(struct fuse_data *data, struct thread *td) { struct fuse_init_in *fiii; struct fuse_dispatcher fdi; fdisp_init(&fdi, sizeof(*fiii)); fdisp_make(&fdi, FUSE_INIT, data->mp, 0, td, NULL); fiii = fdi.indata; fiii->major = FUSE_KERNEL_VERSION; fiii->minor = FUSE_KERNEL_MINOR_VERSION; fiii->max_readahead = FUSE_DEFAULT_IOSIZE * 16; fiii->flags = 0; fuse_insert_callback(fdi.tick, fuse_internal_init_callback); fuse_insert_message(fdi.tick); fdisp_destroy(&fdi); } #ifdef ZERO_PAD_INCOMPLETE_BUFS static int isbzero(void *buf, size_t len) { int i; for (i = 0; i < len; i++) { if (((char *)buf)[i]) return (0); } return (1); } #endif Index: projects/pnfs-server/sys/fs/fuse/fuse_internal.h =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_internal.h (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_internal.h (revision 297887) @@ -1,368 +1,375 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _FUSE_INTERNAL_H_ #define _FUSE_INTERNAL_H_ #include #include #include #include #include "fuse_ipc.h" #include "fuse_node.h" static __inline int vfs_isrdonly(struct mount *mp) { return ((mp->mnt_flag & MNT_RDONLY) != 0 ? 1 : 0); } static __inline struct mount * vnode_mount(struct vnode *vp) { return (vp->v_mount); } static __inline int vnode_mountedhere(struct vnode *vp) { return (vp->v_mountedhere != NULL ? 1 : 0); } static __inline enum vtype vnode_vtype(struct vnode *vp) { return (vp->v_type); } static __inline int vnode_isvroot(struct vnode *vp) { return ((vp->v_vflag & VV_ROOT) != 0 ? 1 : 0); } static __inline int vnode_isreg(struct vnode *vp) { return (vp->v_type == VREG ? 1 : 0); } static __inline int vnode_isdir(struct vnode *vp) { return (vp->v_type == VDIR ? 1 : 0); } static __inline int vnode_islnk(struct vnode *vp) { return (vp->v_type == VLNK ? 1 : 0); } static __inline ssize_t uio_resid(struct uio *uio) { return (uio->uio_resid); } static __inline off_t uio_offset(struct uio *uio) { return (uio->uio_offset); } static __inline void uio_setoffset(struct uio *uio, off_t offset) { uio->uio_offset = offset; } static __inline void uio_setresid(struct uio *uio, ssize_t resid) { uio->uio_resid = resid; } /* miscellaneous */ static __inline__ int fuse_isdeadfs(struct vnode *vp) { struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); return (data->dataflags & FSESS_DEAD); } static __inline__ int fuse_iosize(struct vnode *vp) { return vp->v_mount->mnt_stat.f_iosize; } /* access */ #define FVP_ACCESS_NOOP 0x01 #define FACCESS_VA_VALID 0x01 #define FACCESS_DO_ACCESS 0x02 #define FACCESS_STICKY 0x04 #define FACCESS_CHOWN 0x08 #define FACCESS_NOCHECKSPY 0x10 #define FACCESS_SETGID 0x12 #define FACCESS_XQUERIES FACCESS_STICKY | FACCESS_CHOWN | FACCESS_SETGID struct fuse_access_param { uid_t xuid; gid_t xgid; uint32_t facc_flags; }; static __inline int fuse_match_cred(struct ucred *basecred, struct ucred *usercred) { if (basecred->cr_uid == usercred->cr_uid && basecred->cr_uid == usercred->cr_ruid && basecred->cr_uid == usercred->cr_svuid && basecred->cr_groups[0] == usercred->cr_groups[0] && basecred->cr_groups[0] == usercred->cr_rgid && basecred->cr_groups[0] == usercred->cr_svgid) return 0; return EPERM; } int fuse_internal_access(struct vnode *vp, mode_t mode, struct fuse_access_param *facp, struct thread *td, struct ucred *cred); /* attributes */ static __inline void fuse_internal_attr_fat2vat(struct mount *mp, struct fuse_attr *fat, struct vattr *vap) { DEBUGX(FUSE_DEBUG_INTERNAL, "node #%ju, mode 0%o\n", (uintmax_t)fat->ino, fat->mode); vattr_null(vap); vap->va_fsid = mp->mnt_stat.f_fsid.val[0]; vap->va_fileid = fat->ino; /* XXX cast from 64 bits to 32 */ vap->va_mode = fat->mode & ~S_IFMT; vap->va_nlink = fat->nlink; vap->va_uid = fat->uid; vap->va_gid = fat->gid; vap->va_rdev = fat->rdev; vap->va_size = fat->size; vap->va_atime.tv_sec = fat->atime; /* XXX on some platforms cast from 64 bits to 32 */ vap->va_atime.tv_nsec = fat->atimensec; vap->va_mtime.tv_sec = fat->mtime; vap->va_mtime.tv_nsec = fat->mtimensec; vap->va_ctime.tv_sec = fat->ctime; vap->va_ctime.tv_nsec = fat->ctimensec; vap->va_blocksize = PAGE_SIZE; vap->va_type = IFTOVT(fat->mode); #if (S_BLKSIZE == 512) /* Optimize this case */ vap->va_bytes = fat->blocks << 9; #else vap->va_bytes = fat->blocks * S_BLKSIZE; #endif vap->va_flags = 0; } #define cache_attrs(vp, fuse_out) \ fuse_internal_attr_fat2vat(vnode_mount(vp), &(fuse_out)->attr, \ VTOVA(vp)); /* fsync */ int fuse_internal_fsync(struct vnode *vp, struct thread *td, struct ucred *cred, struct fuse_filehandle *fufh); int fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio); /* readdir */ struct pseudo_dirent { uint32_t d_namlen; }; int fuse_internal_readdir(struct vnode *vp, struct uio *uio, + off_t startoff, struct fuse_filehandle *fufh, - struct fuse_iov *cookediov); + struct fuse_iov *cookediov, + int *ncookies, + u_long *cookies); int fuse_internal_readdir_processdata(struct uio *uio, + off_t startoff, + int *fnd_start, size_t reqsize, void *buf, size_t bufsize, - void *param); + struct fuse_iov *cookediov, + int *ncookies, + u_long **cookies); /* remove */ int fuse_internal_remove(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, enum fuse_opcode op); /* rename */ int fuse_internal_rename(struct vnode *fdvp, struct componentname *fcnp, struct vnode *tdvp, struct componentname *tcnp); /* revoke */ void fuse_internal_vnode_disappear(struct vnode *vp); /* strategy */ /* entity creation */ static __inline int fuse_internal_checkentry(struct fuse_entry_out *feo, enum vtype vtyp) { DEBUGX(FUSE_DEBUG_INTERNAL, "feo=%p, vtype=%d\n", feo, vtyp); if (vtyp != IFTOVT(feo->attr.mode)) { DEBUGX(FUSE_DEBUG_INTERNAL, "EINVAL -- %x != %x\n", vtyp, IFTOVT(feo->attr.mode)); return EINVAL; } if (feo->nodeid == FUSE_NULL_ID) { DEBUGX(FUSE_DEBUG_INTERNAL, "EINVAL -- feo->nodeid is NULL\n"); return EINVAL; } if (feo->nodeid == FUSE_ROOT_ID) { DEBUGX(FUSE_DEBUG_INTERNAL, "EINVAL -- feo->nodeid is FUSE_ROOT_ID\n"); return EINVAL; } return 0; } int fuse_internal_newentry(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum fuse_opcode op, void *buf, size_t bufsize, enum vtype vtyp); void fuse_internal_newentry_makerequest(struct mount *mp, uint64_t dnid, struct componentname *cnp, enum fuse_opcode op, void *buf, size_t bufsize, struct fuse_dispatcher *fdip); int fuse_internal_newentry_core(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum vtype vtyp, struct fuse_dispatcher *fdip); /* entity destruction */ int fuse_internal_forget_callback(struct fuse_ticket *tick, struct uio *uio); void fuse_internal_forget_send(struct mount *mp, struct thread *td, struct ucred *cred, uint64_t nodeid, uint64_t nlookup); /* fuse start/stop */ int fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio); void fuse_internal_send_init(struct fuse_data *data, struct thread *td); #endif /* _FUSE_INTERNAL_H_ */ Index: projects/pnfs-server/sys/fs/fuse/fuse_io.c =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_io.c (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_io.c (revision 297887) @@ -1,810 +1,817 @@ /* * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_node.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_io.h" #define FUSE_DEBUG_MODULE IO #include "fuse_debug.h" static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) { struct fuse_filehandle *fufh; int err, directio; + fufh_type_t fufhtype; MPASS(vp->v_type == VREG || vp->v_type == VDIR); - err = fuse_filehandle_getrw(vp, - (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); - if (err) { - printf("FUSE: io dispatch: filehandles are closed\n"); - return err; - } /* * Ideally, when the daemon asks for direct io at open time, the * standard file flag should be set according to this, so that would * just change the default mode, which later on could be changed via * fcntl(2). * But this doesn't work, the O_DIRECT flag gets cleared at some point * (don't know where). So to make any use of the Fuse direct_io option, * we hardwire it into the file's private data (similarly to Linux, * btw.). */ directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); + if (uio->uio_rw == UIO_READ) + fufhtype = FUFH_RDONLY; + else if (directio != 0) + fufhtype = FUFH_WRONLY; + else + /* Buffer cache writing might read a block in. */ + fufhtype = FUFH_RDWR; + err = fuse_filehandle_getrw(vp, fufhtype, &fufh); + if (err) { + printf("FUSE: io dispatch: filehandles are closed\n"); + return err; + } switch (uio->uio_rw) { case UIO_READ: if (directio) { FS_DEBUG("direct read of vnode %ju via file handle %ju\n", (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); err = fuse_read_directbackend(vp, uio, cred, fufh); } else { FS_DEBUG("buffered read of vnode %ju\n", (uintmax_t)VTOILLU(vp)); err = fuse_read_biobackend(vp, uio, cred, fufh); } break; case UIO_WRITE: if (directio) { FS_DEBUG("direct write of vnode %ju via file handle %ju\n", (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); err = fuse_write_directbackend(vp, uio, cred, fufh); } else { FS_DEBUG("buffered write of vnode %ju\n", (uintmax_t)VTOILLU(vp)); err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag); } break; default: panic("uninterpreted mode passed to fuse_io_dispatch"); } return (err); } static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { struct buf *bp; daddr_t lbn; int bcount; int err = 0, n = 0, on = 0; off_t filesize; const int biosize = fuse_iosize(vp); FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", uio->uio_resid, uio->uio_offset, VTOFUD(vp)->filesize); if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); bcount = MIN(MAXBSIZE, biosize); filesize = VTOFUD(vp)->filesize; do { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); FS_DEBUG2G("biosize %d, lbn %d, on %d\n", biosize, (int)lbn, on); /* * Obtain the buffer cache block. Figure out the buffer size * when we are at EOF. If we are modifying the size of the * buffer based on an EOF condition we need to hold * nfs_rslock() through obtaining the buffer to prevent * a potential writer-appender from messing with n_size. * Otherwise we may accidently truncate the buffer and * lose dirty data. * * Note that bcount is *not* DEV_BSIZE aligned. */ if ((off_t)lbn * biosize >= filesize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { bcount = filesize - (off_t)lbn *biosize; } bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); if (!bp) return (EINTR); /* * If B_CACHE is not set, we must issue the read. If this * fails, we return an error. */ if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); err = fuse_io_strategy(vp, bp); if (err) { brelse(bp); return (err); } } /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount) n = MIN((unsigned)(bcount - on), uio->uio_resid); if (n > 0) { FS_DEBUG2G("feeding buffeater with %d bytes of buffer %p," " saying %d was asked for\n", n, bp->b_data + on, n + (int)bp->b_resid); err = uiomove(bp->b_data + on, n, uio); } brelse(bp); FS_DEBUG2G("end of turn, err %d, uio->uio_resid %zd, n %d\n", err, uio->uio_resid, n); } while (err == 0 && uio->uio_resid > 0 && n > 0); return (err); } static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { struct fuse_dispatcher fdi; struct fuse_read_in *fri; int err = 0; if (uio->uio_resid == 0) return (0); fdisp_init(&fdi, 0); /* * XXX In "normal" case we use an intermediate kernel buffer for * transmitting data from daemon's context to ours. Eventually, we should * get rid of this. Anyway, if the target uio lives in sysspace (we are * called from pageops), and the input data doesn't need kernel-side * processing (we are not called from readdir) we can already invoke * an optimized, "peer-to-peer" I/O routine. */ while (uio->uio_resid > 0) { fdi.iosize = sizeof(*fri); fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); fri = fdi.indata; fri->fh = fufh->fh_id; fri->offset = uio->uio_offset; fri->size = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_read); FS_DEBUG2G("fri->fh %ju, fri->offset %ju, fri->size %ju\n", (uintmax_t)fri->fh, (uintmax_t)fri->offset, (uintmax_t)fri->size); if ((err = fdisp_wait_answ(&fdi))) goto out; FS_DEBUG2G("complete: got iosize=%d, requested fri.size=%zd; " "resid=%zd offset=%ju\n", fri->size, fdi.iosize, uio->uio_resid, (uintmax_t)uio->uio_offset); if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) break; if (fdi.iosize < fri->size) break; } out: fdisp_destroy(&fdi); return (err); } static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_write_in *fwi; struct fuse_dispatcher fdi; size_t chunksize; int diff; int err = 0; if (!uio->uio_resid) return (0); fdisp_init(&fdi, 0); while (uio->uio_resid > 0) { chunksize = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_write); fdi.iosize = sizeof(*fwi) + chunksize; fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); fwi = fdi.indata; fwi->fh = fufh->fh_id; fwi->offset = uio->uio_offset; fwi->size = chunksize; if ((err = uiomove((char *)fdi.indata + sizeof(*fwi), chunksize, uio))) break; if ((err = fdisp_wait_answ(&fdi))) break; diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size; if (diff < 0) { err = EINVAL; break; } uio->uio_resid += diff; uio->uio_offset -= diff; if (uio->uio_offset > fvdat->filesize) fuse_vnode_setsize(vp, cred, uio->uio_offset); } fdisp_destroy(&fdi); return (err); } static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct buf *bp; daddr_t lbn; int bcount; int n, on, err = 0; const int biosize = fuse_iosize(vp); KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", uio->uio_resid, uio->uio_offset, fvdat->filesize); if (vp->v_type != VREG) return (EIO); if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio_setoffset(uio, fvdat->filesize); /* * Find all of this file's B_NEEDCOMMIT buffers. If our writes * would exceed the local maximum per-file write commit size when * combined with those, we must decide whether to flush, * go synchronous, or return err. We don't bother checking * IO_UNIT -- we just make all writes atomic anyway, as there's * no point optimizing for something that really won't ever happen. */ do { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); n = MIN((unsigned)(biosize - on), uio->uio_resid); FS_DEBUG2G("lbn %ju, on %d, n %d, uio offset %ju, uio resid %zd\n", (uintmax_t)lbn, on, n, (uintmax_t)uio->uio_offset, uio->uio_resid); again: /* * Handle direct append and file extension cases, calculate * unaligned buffer size. */ if (uio->uio_offset == fvdat->filesize && n) { /* * Get the buffer (in its pre-append state to maintain * B_CACHE if it was previously set). Resize the * nfsnode after we have locked the buffer to prevent * readers from reading garbage. */ bcount = on; FS_DEBUG("getting block from OS, bcount %d\n", bcount); bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); if (bp != NULL) { long save; err = fuse_vnode_setsize(vp, cred, uio->uio_offset + n); if (err) { brelse(bp); break; } save = bp->b_flags & B_CACHE; bcount += n; allocbuf(bp, bcount); bp->b_flags |= save; } } else { /* * Obtain the locked cache block first, and then * adjust the file's size as appropriate. */ bcount = on + n; if ((off_t)lbn * biosize + bcount < fvdat->filesize) { if ((off_t)(lbn + 1) * biosize < fvdat->filesize) bcount = biosize; else bcount = fvdat->filesize - (off_t)lbn *biosize; } FS_DEBUG("getting block from OS, bcount %d\n", bcount); bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); if (bp && uio->uio_offset + n > fvdat->filesize) { err = fuse_vnode_setsize(vp, cred, uio->uio_offset + n); if (err) { brelse(bp); break; } } } if (!bp) { err = EINTR; break; } /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ if (on == 0 && n == bcount) { bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); fuse_io_strategy(vp, bp); if ((err = bp->b_error)) { brelse(bp); break; } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { FS_DEBUG("FUSE append race @%lx:%d\n", (long)bp->b_blkno * biosize, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. * * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * * as an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { /* * Yes, we mean it. Write out everything to "storage" * immediatly, without hesitation. (Apart from other * reasons: the only way to know if a write is valid * if its actually written out.) */ bwrite(bp); if (bp->b_error == EINTR) { err = EINTR; break; } goto again; } err = uiomove((char *)bp->b_data + on, n, uio); /* * Since this block is being modified, it must be written * again and not just committed. Since write clustering does * not work for the stage 1 data write, only the stage 2 * commit rpc, we have to clear B_CLUSTEROK as well. */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (err) { bp->b_ioflags |= BIO_ERROR; bp->b_error = err; brelse(bp); break; } /* * Only update dirtyoff/dirtyend if not a degenerate * condition. */ if (n) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_valid(bp, on, n); } err = bwrite(bp); if (err) break; } while (uio->uio_resid > 0 && n > 0); if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) fuse_vnode_savesize(vp, cred); return (err); } int fuse_io_strategy(struct vnode *vp, struct buf *bp) { struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct ucred *cred; struct uio *uiop; struct uio uio; struct iovec io; int error = 0; const int biosize = fuse_iosize(vp); MPASS(vp->v_type == VREG || vp->v_type == VDIR); MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); FS_DEBUG("inode=%ju offset=%jd resid=%ld\n", (uintmax_t)VTOI(vp), (intmax_t)(((off_t)bp->b_blkno) * biosize), bp->b_bcount); error = fuse_filehandle_getrw(vp, (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); if (error) { printf("FUSE: strategy: filehandles are closed\n"); bp->b_ioflags |= BIO_ERROR; bp->b_error = error; return (error); } cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = curthread; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. */ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("fuse_io_strategy: bp %p already marked done", bp)); if (bp->b_iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; error = fuse_read_directbackend(vp, uiop, cred, fufh); if ((!error && uiop->uio_resid) || (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && uiop->uio_offset >= fvdat->cached_attrs.va_size)) { /* * If we had a short read with no error, we must have * hit a file hole. We should zero-fill the remainder. * This can also occur if the server hits the file EOF. * * Holes used to be able to occur due to pending * writes, but that is not possible any longer. */ int nread = bp->b_bcount - uiop->uio_resid; int left = uiop->uio_resid; if (error != 0) { printf("FUSE: Fix broken io: offset %ju, " " resid %zd, file size %ju/%ju\n", (uintmax_t)uiop->uio_offset, uiop->uio_resid, fvdat->filesize, fvdat->cached_attrs.va_size); error = 0; } if (left > 0) bzero((char *)bp->b_data + nread, left); uiop->uio_resid = 0; } if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * If we only need to commit, try to commit */ if (bp->b_flags & B_NEEDCOMMIT) { FS_DEBUG("write: B_NEEDCOMMIT flags set\n"); } /* * Setup for actual write */ if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > fvdat->filesize) bp->b_dirtyend = fvdat->filesize - (off_t)bp->b_blkno * biosize; if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_blkno * biosize + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; error = fuse_write_directbackend(vp, uiop, cred, fufh); if (error == EINTR || error == ETIMEDOUT || (!error && (bp->b_flags & B_NEEDCOMMIT))) { bp->b_flags &= ~(B_INVAL | B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((error == EINTR || error == ETIMEDOUT) && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_flags |= B_INVAL; bp->b_error = error; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; bufdone(bp); return (error); } int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td) { struct vop_fsync_args a = { .a_vp = vp, .a_waitfor = waitfor, .a_td = td, }; return (vop_stdfsync(&a)); } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int fuse_io_invalbuf(struct vnode *vp, struct thread *td) { struct fuse_vnode_data *fvdat = VTOFUD(vp); int error = 0; if (vp->v_iflag & VI_DOOMED) return 0; ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf"); while (fvdat->flag & FN_FLUSHINPROG) { struct proc *p = td->td_proc; if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) return EIO; fvdat->flag |= FN_FLUSHWANT; tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); error = 0; if (p != NULL) { PROC_LOCK(p); if (SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist)) error = EINTR; PROC_UNLOCK(p); } if (error == EINTR) return EINTR; } fvdat->flag |= FN_FLUSHINPROG; if (vp->v_bufobj.bo_object != NULL) { VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); while (error) { if (error == ERESTART || error == EINTR) { fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return EINTR; } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); } fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return (error); } Index: projects/pnfs-server/sys/fs/fuse/fuse_ipc.c =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_ipc.c (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_ipc.c (revision 297887) @@ -1,904 +1,905 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_node.h" #include "fuse_ipc.h" #include "fuse_internal.h" #define FUSE_DEBUG_MODULE IPC #include "fuse_debug.h" static struct fuse_ticket *fticket_alloc(struct fuse_data *data); static void fticket_refresh(struct fuse_ticket *ftick); static void fticket_destroy(struct fuse_ticket *ftick); static int fticket_wait_answer(struct fuse_ticket *ftick); static __inline__ int fticket_aw_pull_uio(struct fuse_ticket *ftick, struct uio *uio); static int fuse_body_audit(struct fuse_ticket *ftick, size_t blen); static __inline__ void fuse_setup_ihead(struct fuse_in_header *ihead, struct fuse_ticket *ftick, uint64_t nid, enum fuse_opcode op, size_t blen, pid_t pid, struct ucred *cred); static fuse_handler_t fuse_standard_handler; SYSCTL_NODE(_vfs, OID_AUTO, fuse, CTLFLAG_RW, 0, "FUSE tunables"); SYSCTL_STRING(_vfs_fuse, OID_AUTO, version, CTLFLAG_RD, FUSE_FREEBSD_VERSION, 0, "fuse-freebsd version"); static int fuse_ticket_count = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, ticket_count, CTLFLAG_RW, &fuse_ticket_count, 0, "number of allocated tickets"); static long fuse_iov_permanent_bufsize = 1 << 19; SYSCTL_LONG(_vfs_fuse, OID_AUTO, iov_permanent_bufsize, CTLFLAG_RW, &fuse_iov_permanent_bufsize, 0, "limit for permanently stored buffer size for fuse_iovs"); static int fuse_iov_credit = 16; SYSCTL_INT(_vfs_fuse, OID_AUTO, iov_credit, CTLFLAG_RW, &fuse_iov_credit, 0, "how many times is an oversized fuse_iov tolerated"); MALLOC_DEFINE(M_FUSEMSG, "fuse_msgbuf", "fuse message buffer"); static uma_zone_t ticket_zone; static void fuse_block_sigs(sigset_t *oldset) { sigset_t newset; SIGFILLSET(newset); SIGDELSET(newset, SIGKILL); if (kern_sigprocmask(curthread, SIG_BLOCK, &newset, oldset, 0)) panic("%s: Invalid operation for kern_sigprocmask()", __func__); } static void fuse_restore_sigs(sigset_t *oldset) { if (kern_sigprocmask(curthread, SIG_SETMASK, oldset, NULL, 0)) panic("%s: Invalid operation for kern_sigprocmask()", __func__); } void fiov_init(struct fuse_iov *fiov, size_t size) { uint32_t msize = FU_AT_LEAST(size); debug_printf("fiov=%p, size=%zd\n", fiov, size); fiov->len = 0; fiov->base = malloc(msize, M_FUSEMSG, M_WAITOK | M_ZERO); fiov->allocated_size = msize; fiov->credit = fuse_iov_credit; } void fiov_teardown(struct fuse_iov *fiov) { debug_printf("fiov=%p\n", fiov); MPASS(fiov->base != NULL); free(fiov->base, M_FUSEMSG); } void fiov_adjust(struct fuse_iov *fiov, size_t size) { debug_printf("fiov=%p, size=%zd\n", fiov, size); if (fiov->allocated_size < size || (fuse_iov_permanent_bufsize >= 0 && fiov->allocated_size - size > fuse_iov_permanent_bufsize && --fiov->credit < 0)) { fiov->base = realloc(fiov->base, FU_AT_LEAST(size), M_FUSEMSG, M_WAITOK | M_ZERO); if (!fiov->base) { panic("FUSE: realloc failed"); } fiov->allocated_size = FU_AT_LEAST(size); fiov->credit = fuse_iov_credit; } fiov->len = size; } void fiov_refresh(struct fuse_iov *fiov) { debug_printf("fiov=%p\n", fiov); bzero(fiov->base, fiov->len); fiov_adjust(fiov, 0); } static int fticket_ctor(void *mem, int size, void *arg, int flags) { struct fuse_ticket *ftick = mem; struct fuse_data *data = arg; debug_printf("ftick=%p data=%p\n", ftick, data); FUSE_ASSERT_MS_DONE(ftick); FUSE_ASSERT_AW_DONE(ftick); ftick->tk_data = data; if (ftick->tk_unique != 0) fticket_refresh(ftick); /* May be truncated to 32 bits */ ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1); if (ftick->tk_unique == 0) ftick->tk_unique = atomic_fetchadd_long(&data->ticketer, 1); refcount_init(&ftick->tk_refcount, 1); atomic_add_acq_int(&fuse_ticket_count, 1); return 0; } static void fticket_dtor(void *mem, int size, void *arg) { struct fuse_ticket *ftick = mem; debug_printf("ftick=%p\n", ftick); FUSE_ASSERT_MS_DONE(ftick); FUSE_ASSERT_AW_DONE(ftick); atomic_subtract_acq_int(&fuse_ticket_count, 1); } static int fticket_init(void *mem, int size, int flags) { struct fuse_ticket *ftick = mem; FS_DEBUG("ftick=%p\n", ftick); bzero(ftick, sizeof(struct fuse_ticket)); fiov_init(&ftick->tk_ms_fiov, sizeof(struct fuse_in_header)); ftick->tk_ms_type = FT_M_FIOV; mtx_init(&ftick->tk_aw_mtx, "fuse answer delivery mutex", NULL, MTX_DEF); fiov_init(&ftick->tk_aw_fiov, 0); ftick->tk_aw_type = FT_A_FIOV; return 0; } static void fticket_fini(void *mem, int size) { struct fuse_ticket *ftick = mem; FS_DEBUG("ftick=%p\n", ftick); fiov_teardown(&ftick->tk_ms_fiov); fiov_teardown(&ftick->tk_aw_fiov); mtx_destroy(&ftick->tk_aw_mtx); } static __inline struct fuse_ticket * fticket_alloc(struct fuse_data *data) { return uma_zalloc_arg(ticket_zone, data, M_WAITOK); } static __inline void fticket_destroy(struct fuse_ticket *ftick) { return uma_zfree(ticket_zone, ftick); } static __inline__ void fticket_refresh(struct fuse_ticket *ftick) { debug_printf("ftick=%p\n", ftick); FUSE_ASSERT_MS_DONE(ftick); FUSE_ASSERT_AW_DONE(ftick); fiov_refresh(&ftick->tk_ms_fiov); ftick->tk_ms_bufdata = NULL; ftick->tk_ms_bufsize = 0; ftick->tk_ms_type = FT_M_FIOV; bzero(&ftick->tk_aw_ohead, sizeof(struct fuse_out_header)); fiov_refresh(&ftick->tk_aw_fiov); ftick->tk_aw_errno = 0; ftick->tk_aw_bufdata = NULL; ftick->tk_aw_bufsize = 0; ftick->tk_aw_type = FT_A_FIOV; ftick->tk_flag = 0; } static int fticket_wait_answer(struct fuse_ticket *ftick) { sigset_t tset; int err = 0; struct fuse_data *data; debug_printf("ftick=%p\n", ftick); fuse_lck_mtx_lock(ftick->tk_aw_mtx); if (fticket_answered(ftick)) { goto out; } data = ftick->tk_data; if (fdata_get_dead(data)) { err = ENOTCONN; fticket_set_answered(ftick); goto out; } fuse_block_sigs(&tset); err = msleep(ftick, &ftick->tk_aw_mtx, PCATCH, "fu_ans", data->daemon_timeout * hz); fuse_restore_sigs(&tset); if (err == EAGAIN) { /* same as EWOULDBLOCK */ #ifdef XXXIP /* die conditionally */ if (!fdata_get_dead(data)) { fdata_set_dead(data); } #endif err = ETIMEDOUT; fticket_set_answered(ftick); } out: if (!(err || fticket_answered(ftick))) { debug_printf("FUSE: requester was woken up but still no answer"); err = ENXIO; } fuse_lck_mtx_unlock(ftick->tk_aw_mtx); return err; } static __inline__ int fticket_aw_pull_uio(struct fuse_ticket *ftick, struct uio *uio) { int err = 0; size_t len = uio_resid(uio); debug_printf("ftick=%p, uio=%p\n", ftick, uio); if (len) { switch (ftick->tk_aw_type) { case FT_A_FIOV: fiov_adjust(fticket_resp(ftick), len); err = uiomove(fticket_resp(ftick)->base, len, uio); if (err) { debug_printf("FUSE: FT_A_FIOV: error is %d" " (%p, %zd, %p)\n", err, fticket_resp(ftick)->base, len, uio); } break; case FT_A_BUF: ftick->tk_aw_bufsize = len; err = uiomove(ftick->tk_aw_bufdata, len, uio); if (err) { debug_printf("FUSE: FT_A_BUF: error is %d" " (%p, %zd, %p)\n", err, ftick->tk_aw_bufdata, len, uio); } break; default: panic("FUSE: unknown answer type for ticket %p", ftick); } } return err; } int fticket_pull(struct fuse_ticket *ftick, struct uio *uio) { int err = 0; debug_printf("ftick=%p, uio=%p\n", ftick, uio); if (ftick->tk_aw_ohead.error) { return 0; } err = fuse_body_audit(ftick, uio_resid(uio)); if (!err) { err = fticket_aw_pull_uio(ftick, uio); } return err; } struct fuse_data * fdata_alloc(struct cdev *fdev, struct ucred *cred) { struct fuse_data *data; debug_printf("fdev=%p\n", fdev); data = malloc(sizeof(struct fuse_data), M_FUSEMSG, M_WAITOK | M_ZERO); data->fdev = fdev; mtx_init(&data->ms_mtx, "fuse message list mutex", NULL, MTX_DEF); STAILQ_INIT(&data->ms_head); mtx_init(&data->aw_mtx, "fuse answer list mutex", NULL, MTX_DEF); TAILQ_INIT(&data->aw_head); data->daemoncred = crhold(cred); data->daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT; sx_init(&data->rename_lock, "fuse rename lock"); data->ref = 1; return data; } void fdata_trydestroy(struct fuse_data *data) { FS_DEBUG("data=%p data.mp=%p data.fdev=%p data.flags=%04x\n", data, data->mp, data->fdev, data->dataflags); FS_DEBUG("destroy: data=%p\n", data); data->ref--; MPASS(data->ref >= 0); if (data->ref != 0) return; /* Driving off stage all that stuff thrown at device... */ mtx_destroy(&data->ms_mtx); mtx_destroy(&data->aw_mtx); sx_destroy(&data->rename_lock); crfree(data->daemoncred); free(data, M_FUSEMSG); } void fdata_set_dead(struct fuse_data *data) { debug_printf("data=%p\n", data); FUSE_LOCK(); if (fdata_get_dead(data)) { FUSE_UNLOCK(); return; } fuse_lck_mtx_lock(data->ms_mtx); data->dataflags |= FSESS_DEAD; wakeup_one(data); selwakeuppri(&data->ks_rsel, PZERO + 1); wakeup(&data->ticketer); fuse_lck_mtx_unlock(data->ms_mtx); FUSE_UNLOCK(); } struct fuse_ticket * fuse_ticket_fetch(struct fuse_data *data) { int err = 0; struct fuse_ticket *ftick; debug_printf("data=%p\n", data); ftick = fticket_alloc(data); if (!(data->dataflags & FSESS_INITED)) { /* Sleep until get answer for INIT messsage */ FUSE_LOCK(); if (!(data->dataflags & FSESS_INITED) && data->ticketer > 2) { err = msleep(&data->ticketer, &fuse_mtx, PCATCH | PDROP, "fu_ini", 0); if (err) fdata_set_dead(data); } else FUSE_UNLOCK(); } return ftick; } int fuse_ticket_drop(struct fuse_ticket *ftick) { int die; die = refcount_release(&ftick->tk_refcount); debug_printf("ftick=%p refcount=%d\n", ftick, ftick->tk_refcount); if (die) fticket_destroy(ftick); return die; } void fuse_insert_callback(struct fuse_ticket *ftick, fuse_handler_t * handler) { debug_printf("ftick=%p, handler=%p data=%p\n", ftick, ftick->tk_data, handler); if (fdata_get_dead(ftick->tk_data)) { return; } ftick->tk_aw_handler = handler; fuse_lck_mtx_lock(ftick->tk_data->aw_mtx); fuse_aw_push(ftick); fuse_lck_mtx_unlock(ftick->tk_data->aw_mtx); } void fuse_insert_message(struct fuse_ticket *ftick) { debug_printf("ftick=%p\n", ftick); if (ftick->tk_flag & FT_DIRTY) { panic("FUSE: ticket reused without being refreshed"); } ftick->tk_flag |= FT_DIRTY; if (fdata_get_dead(ftick->tk_data)) { return; } fuse_lck_mtx_lock(ftick->tk_data->ms_mtx); fuse_ms_push(ftick); wakeup_one(ftick->tk_data); selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1); fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx); } static int fuse_body_audit(struct fuse_ticket *ftick, size_t blen) { int err = 0; enum fuse_opcode opcode; + struct fuse_getxattr_in *fgin; debug_printf("ftick=%p, blen = %zu\n", ftick, blen); opcode = fticket_opcode(ftick); switch (opcode) { case FUSE_LOOKUP: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_FORGET: panic("FUSE: a handler has been intalled for FUSE_FORGET"); break; case FUSE_GETATTR: err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL; break; case FUSE_SETATTR: err = (blen == sizeof(struct fuse_attr_out)) ? 0 : EINVAL; break; case FUSE_READLINK: err = (PAGE_SIZE >= blen) ? 0 : EINVAL; break; case FUSE_SYMLINK: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_MKNOD: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_MKDIR: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_UNLINK: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_RMDIR: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_RENAME: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_LINK: err = (blen == sizeof(struct fuse_entry_out)) ? 0 : EINVAL; break; case FUSE_OPEN: err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL; break; case FUSE_READ: err = (((struct fuse_read_in *)( (char *)ftick->tk_ms_fiov.base + sizeof(struct fuse_in_header) ))->size >= blen) ? 0 : EINVAL; break; case FUSE_WRITE: err = (blen == sizeof(struct fuse_write_out)) ? 0 : EINVAL; break; case FUSE_STATFS: if (fuse_libabi_geq(ftick->tk_data, 7, 4)) { err = (blen == sizeof(struct fuse_statfs_out)) ? 0 : EINVAL; } else { err = (blen == FUSE_COMPAT_STATFS_SIZE) ? 0 : EINVAL; } break; case FUSE_RELEASE: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_FSYNC: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_SETXATTR: - panic("FUSE_SETXATTR implementor has forgotten to define a" - " response body format check"); + err = (blen == 0) ? 0 : EINVAL; break; case FUSE_GETXATTR: - panic("FUSE_GETXATTR implementor has forgotten to define a" - " response body format check"); - break; - case FUSE_LISTXATTR: - panic("FUSE_LISTXATTR implementor has forgotten to define a" - " response body format check"); + fgin = (struct fuse_getxattr_in *) + ((char *)ftick->tk_ms_fiov.base + + sizeof(struct fuse_in_header)); + if (fgin->size == 0) + err = (blen == sizeof(struct fuse_getxattr_out)) ? 0 : + EINVAL; + else + err = (blen <= fgin->size) ? 0 : EINVAL; break; case FUSE_REMOVEXATTR: - panic("FUSE_REMOVEXATTR implementor has forgotten to define a" - " response body format check"); + err = (blen == 0) ? 0 : EINVAL; break; case FUSE_FLUSH: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_INIT: if (blen == sizeof(struct fuse_init_out) || blen == 8) { err = 0; } else { err = EINVAL; } break; case FUSE_OPENDIR: err = (blen == sizeof(struct fuse_open_out)) ? 0 : EINVAL; break; case FUSE_READDIR: err = (((struct fuse_read_in *)( (char *)ftick->tk_ms_fiov.base + sizeof(struct fuse_in_header) ))->size >= blen) ? 0 : EINVAL; break; case FUSE_RELEASEDIR: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_FSYNCDIR: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_GETLK: - panic("FUSE: no response body format check for FUSE_GETLK"); + err = (blen == sizeof(struct fuse_lk_out)) ? 0 : EINVAL; break; case FUSE_SETLK: - panic("FUSE: no response body format check for FUSE_SETLK"); + err = (blen == 0) ? 0 : EINVAL; break; case FUSE_SETLKW: - panic("FUSE: no response body format check for FUSE_SETLKW"); + err = (blen == 0) ? 0 : EINVAL; break; case FUSE_ACCESS: err = (blen == 0) ? 0 : EINVAL; break; case FUSE_CREATE: err = (blen == sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)) ? 0 : EINVAL; break; case FUSE_DESTROY: err = (blen == 0) ? 0 : EINVAL; break; default: panic("FUSE: opcodes out of sync (%d)\n", opcode); } return err; } static void fuse_setup_ihead(struct fuse_in_header *ihead, struct fuse_ticket *ftick, uint64_t nid, enum fuse_opcode op, size_t blen, pid_t pid, struct ucred *cred) { ihead->len = sizeof(*ihead) + blen; ihead->unique = ftick->tk_unique; ihead->nodeid = nid; ihead->opcode = op; debug_printf("ihead=%p, ftick=%p, nid=%ju, op=%d, blen=%zu\n", ihead, ftick, (uintmax_t)nid, op, blen); ihead->pid = pid; ihead->uid = cred->cr_uid; ihead->gid = cred->cr_rgid; } /* * fuse_standard_handler just pulls indata and wakes up pretender. * Doesn't try to interpret data, that's left for the pretender. * Though might do a basic size verification before the pull-in takes place */ static int fuse_standard_handler(struct fuse_ticket *ftick, struct uio *uio) { int err = 0; debug_printf("ftick=%p, uio=%p\n", ftick, uio); err = fticket_pull(ftick, uio); fuse_lck_mtx_lock(ftick->tk_aw_mtx); if (!fticket_answered(ftick)) { fticket_set_answered(ftick); ftick->tk_aw_errno = err; wakeup(ftick); } fuse_lck_mtx_unlock(ftick->tk_aw_mtx); return err; } void fdisp_make_pid(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, pid_t pid, struct ucred *cred) { struct fuse_data *data = fuse_get_mpdata(mp); debug_printf("fdip=%p, op=%d, mp=%p, nid=%ju\n", fdip, op, mp, (uintmax_t)nid); if (fdip->tick) { fticket_refresh(fdip->tick); } else { fdip->tick = fuse_ticket_fetch(data); } FUSE_DIMALLOC(&fdip->tick->tk_ms_fiov, fdip->finh, fdip->indata, fdip->iosize); fuse_setup_ihead(fdip->finh, fdip->tick, nid, op, fdip->iosize, pid, cred); } void fdisp_make(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct mount *mp, uint64_t nid, struct thread *td, struct ucred *cred) { RECTIFY_TDCR(td, cred); return fdisp_make_pid(fdip, op, mp, nid, td->td_proc->p_pid, cred); } void fdisp_make_vp(struct fuse_dispatcher *fdip, enum fuse_opcode op, struct vnode *vp, struct thread *td, struct ucred *cred) { debug_printf("fdip=%p, op=%d, vp=%p\n", fdip, op, vp); RECTIFY_TDCR(td, cred); return fdisp_make_pid(fdip, op, vnode_mount(vp), VTOI(vp), td->td_proc->p_pid, cred); } int fdisp_wait_answ(struct fuse_dispatcher *fdip) { int err = 0; fdip->answ_stat = 0; fuse_insert_callback(fdip->tick, fuse_standard_handler); fuse_insert_message(fdip->tick); if ((err = fticket_wait_answer(fdip->tick))) { debug_printf("IPC: interrupted, err = %d\n", err); fuse_lck_mtx_lock(fdip->tick->tk_aw_mtx); if (fticket_answered(fdip->tick)) { /* * Just between noticing the interrupt and getting here, * the standard handler has completed his job. * So we drop the ticket and exit as usual. */ debug_printf("IPC: already answered\n"); fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx); goto out; } else { /* * So we were faster than the standard handler. * Then by setting the answered flag we get *him* * to drop the ticket. */ debug_printf("IPC: setting to answered\n"); fticket_set_answered(fdip->tick); fuse_lck_mtx_unlock(fdip->tick->tk_aw_mtx); return err; } } debug_printf("IPC: not interrupted, err = %d\n", err); if (fdip->tick->tk_aw_errno) { debug_printf("IPC: explicit EIO-ing, tk_aw_errno = %d\n", fdip->tick->tk_aw_errno); err = EIO; goto out; } if ((err = fdip->tick->tk_aw_ohead.error)) { debug_printf("IPC: setting status to %d\n", fdip->tick->tk_aw_ohead.error); /* * This means a "proper" fuse syscall error. * We record this value so the caller will * be able to know it's not a boring messaging * failure, if she wishes so (and if not, she can * just simply propagate the return value of this routine). * [XXX Maybe a bitflag would do the job too, * if other flags needed, this will be converted thusly.] */ fdip->answ_stat = err; goto out; } fdip->answ = fticket_resp(fdip->tick)->base; fdip->iosize = fticket_resp(fdip->tick)->len; debug_printf("IPC: all is well\n"); return 0; out: debug_printf("IPC: dropping ticket, err = %d\n", err); return err; } void fuse_ipc_init(void) { ticket_zone = uma_zcreate("fuse_ticket", sizeof(struct fuse_ticket), fticket_ctor, fticket_dtor, fticket_init, fticket_fini, UMA_ALIGN_PTR, 0); } void fuse_ipc_destroy(void) { uma_zdestroy(ticket_zone); } Index: projects/pnfs-server/sys/fs/fuse/fuse_node.c =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_node.c (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_node.c (revision 297887) @@ -1,387 +1,396 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_node.h" #include "fuse_internal.h" #include "fuse_io.h" #include "fuse_ipc.h" #define FUSE_DEBUG_MODULE VNOPS #include "fuse_debug.h" MALLOC_DEFINE(M_FUSEVN, "fuse_vnode", "fuse vnode private data"); static int fuse_node_count = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, node_count, CTLFLAG_RD, &fuse_node_count, 0, ""); int fuse_data_cache_enable = 1; SYSCTL_INT(_vfs_fuse, OID_AUTO, data_cache_enable, CTLFLAG_RW, &fuse_data_cache_enable, 0, ""); int fuse_data_cache_invalidate = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, data_cache_invalidate, CTLFLAG_RW, &fuse_data_cache_invalidate, 0, ""); int fuse_mmap_enable = 1; SYSCTL_INT(_vfs_fuse, OID_AUTO, mmap_enable, CTLFLAG_RW, &fuse_mmap_enable, 0, ""); int fuse_refresh_size = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, refresh_size, CTLFLAG_RW, &fuse_refresh_size, 0, ""); int fuse_sync_resize = 1; SYSCTL_INT(_vfs_fuse, OID_AUTO, sync_resize, CTLFLAG_RW, &fuse_sync_resize, 0, ""); int fuse_fix_broken_io = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, fix_broken_io, CTLFLAG_RW, &fuse_fix_broken_io, 0, ""); static void fuse_vnode_init(struct vnode *vp, struct fuse_vnode_data *fvdat, uint64_t nodeid, enum vtype vtyp) { int i; fvdat->nid = nodeid; if (nodeid == FUSE_ROOT_ID) { vp->v_vflag |= VV_ROOT; } vp->v_type = vtyp; vp->v_data = fvdat; for (i = 0; i < FUFH_MAXTYPE; i++) fvdat->fufh[i].fh_type = FUFH_INVALID; atomic_add_acq_int(&fuse_node_count, 1); } void fuse_vnode_destroy(struct vnode *vp) { struct fuse_vnode_data *fvdat = vp->v_data; vp->v_data = NULL; free(fvdat, M_FUSEVN); atomic_subtract_acq_int(&fuse_node_count, 1); } static int fuse_vnode_cmp(struct vnode *vp, void *nidp) { return (VTOI(vp) != *((uint64_t *)nidp)); } static uint32_t __inline fuse_vnode_hash(uint64_t id) { return (fnv_32_buf(&id, sizeof(id), FNV1_32_INIT)); } -static int +int fuse_vnode_alloc(struct mount *mp, struct thread *td, uint64_t nodeid, enum vtype vtyp, struct vnode **vpp) { struct fuse_vnode_data *fvdat; struct vnode *vp2; int err = 0; FS_DEBUG("been asked for vno #%ju\n", (uintmax_t)nodeid); if (vtyp == VNON) { return EINVAL; } *vpp = NULL; err = vfs_hash_get(mp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE, td, vpp, fuse_vnode_cmp, &nodeid); if (err) return (err); if (*vpp) { MPASS((*vpp)->v_type == vtyp && (*vpp)->v_data != NULL); FS_DEBUG("vnode taken from hash\n"); return (0); } fvdat = malloc(sizeof(*fvdat), M_FUSEVN, M_WAITOK | M_ZERO); err = getnewvnode("fuse", mp, &fuse_vnops, vpp); if (err) { free(fvdat, M_FUSEVN); return (err); } lockmgr((*vpp)->v_vnlock, LK_EXCLUSIVE, NULL); fuse_vnode_init(*vpp, fvdat, nodeid, vtyp); err = insmntque(*vpp, mp); ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc"); if (err) { free(fvdat, M_FUSEVN); *vpp = NULL; return (err); } err = vfs_hash_insert(*vpp, fuse_vnode_hash(nodeid), LK_EXCLUSIVE, td, &vp2, fuse_vnode_cmp, &nodeid); if (err) return (err); if (vp2 != NULL) { *vpp = vp2; return (0); } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnode_alloc"); return (0); } int fuse_vnode_get(struct mount *mp, uint64_t nodeid, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum vtype vtyp) { struct thread *td = (cnp != NULL ? cnp->cn_thread : curthread); int err = 0; debug_printf("dvp=%p\n", dvp); err = fuse_vnode_alloc(mp, td, nodeid, vtyp, vpp); if (err) { return err; } if (dvp != NULL) { MPASS((cnp->cn_flags & ISDOTDOT) == 0); MPASS(!(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')); fuse_vnode_setparent(*vpp, dvp); } if (dvp != NULL && cnp != NULL && (cnp->cn_flags & MAKEENTRY) != 0) { ASSERT_VOP_LOCKED(*vpp, "fuse_vnode_get"); ASSERT_VOP_LOCKED(dvp, "fuse_vnode_get"); cache_enter(dvp, *vpp, cnp); } /* * In userland, libfuse uses cached lookups for dot and dotdot entries, * thus it does not really bump the nlookup counter for forget. * Follow the same semantic and avoid tu bump it in order to keep * nlookup counters consistent. */ if (cnp == NULL || ((cnp->cn_flags & ISDOTDOT) == 0 && (cnp->cn_namelen != 1 || cnp->cn_nameptr[0] != '.'))) VTOFUD(*vpp)->nlookup++; return 0; } void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td) { /* * Funcation is called for every vnode open. * Merge fuse_open_flags it may be 0 * * XXXIP: Handle FOPEN_KEEP_CACHE */ /* * Ideally speaking, direct io should be enabled on * fd's but do not see of any way of providing that * this implementation. * * Also cannot think of a reason why would two * different fd's on same vnode would like * have DIRECT_IO turned on and off. But linux * based implementation works on an fd not an * inode and provides such a feature. * * XXXIP: Handle fd based DIRECT_IO */ if (fuse_open_flags & FOPEN_DIRECT_IO) { - VTOFUD(vp)->flag |= FN_DIRECTIO; + ASSERT_VOP_ELOCKED(vp, __func__); + /* + * If switching from buffer cache I/O to direct I/O, the + * buffer cache blocks must be invalidated to avoid accessing + * stale data in the buffer cache. + */ + if ((VTOFUD(vp)->flag & FN_DIRECTIO) == 0) { + VTOFUD(vp)->flag |= FN_DIRECTIO; + fuse_io_invalbuf(vp, td); + } } else { VTOFUD(vp)->flag &= ~FN_DIRECTIO; } if (vnode_vtype(vp) == VREG) { /* XXXIP prevent getattr, by using cached node size */ vnode_create_vobject(vp, 0, td); } } int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct thread *td = curthread; struct fuse_filehandle *fufh = NULL; struct fuse_dispatcher fdi; struct fuse_setattr_in *fsai; int err = 0; FS_DEBUG("inode=%ju size=%ju\n", (uintmax_t)VTOI(vp), (uintmax_t)fvdat->filesize); ASSERT_VOP_ELOCKED(vp, "fuse_io_extend"); if (fuse_isdeadfs(vp)) { return EBADF; } if (vnode_vtype(vp) == VDIR) { return EISDIR; } if (vfs_isrdonly(vnode_mount(vp))) { return EROFS; } if (cred == NULL) { cred = td->td_ucred; } fdisp_init(&fdi, sizeof(*fsai)); fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); fsai = fdi.indata; fsai->valid = 0; /* Truncate to a new value. */ fsai->size = fvdat->filesize; fsai->valid |= FATTR_SIZE; fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); if (fufh) { fsai->fh = fufh->fh_id; fsai->valid |= FATTR_FH; } err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); if (err == 0) fvdat->flag &= ~FN_SIZECHANGE; return err; } void fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct vattr va; if ((fvdat->flag & FN_SIZECHANGE) != 0 || (fuse_refresh_size == 0 && fvdat->filesize != 0)) return; VOP_GETATTR(vp, &va, cred); FS_DEBUG("refreshed file size: %jd\n", (intmax_t)VTOFUD(vp)->filesize); } int fuse_vnode_setsize(struct vnode *vp, struct ucred *cred, off_t newsize) { struct fuse_vnode_data *fvdat = VTOFUD(vp); off_t oldsize; int err = 0; FS_DEBUG("inode=%ju oldsize=%ju newsize=%ju\n", (uintmax_t)VTOI(vp), (uintmax_t)fvdat->filesize, (uintmax_t)newsize); ASSERT_VOP_ELOCKED(vp, "fuse_vnode_setsize"); oldsize = fvdat->filesize; fvdat->filesize = newsize; fvdat->flag |= FN_SIZECHANGE; if (newsize < oldsize) { err = vtruncbuf(vp, cred, newsize, fuse_iosize(vp)); } vnode_pager_setsize(vp, newsize); return err; } Index: projects/pnfs-server/sys/fs/fuse/fuse_node.h =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_node.h (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_node.h (revision 297887) @@ -1,131 +1,154 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _FUSE_NODE_H_ #define _FUSE_NODE_H_ #include #include #include "fuse_file.h" #define FN_REVOKED 0x00000020 #define FN_FLUSHINPROG 0x00000040 #define FN_FLUSHWANT 0x00000080 #define FN_SIZECHANGE 0x00000100 #define FN_DIRECTIO 0x00000200 struct fuse_vnode_data { /** self **/ uint64_t nid; /** parent **/ /* XXXIP very likely to be stale, it's not updated in rename() */ uint64_t parent_nid; /** I/O **/ struct fuse_filehandle fufh[FUFH_MAXTYPE]; /** flags **/ uint32_t flag; /** meta **/ struct vattr cached_attrs; off_t filesize; uint64_t nlookup; enum vtype vtype; }; #define VTOFUD(vp) \ ((struct fuse_vnode_data *)((vp)->v_data)) #define VTOI(vp) (VTOFUD(vp)->nid) #define VTOVA(vp) (&(VTOFUD(vp)->cached_attrs)) #define VTOILLU(vp) ((uint64_t)(VTOFUD(vp) ? VTOI(vp) : 0)) #define FUSE_NULL_ID 0 extern struct vop_vector fuse_vnops; static __inline void fuse_vnode_setparent(struct vnode *vp, struct vnode *dvp) { if (dvp != NULL && vp->v_type == VDIR) { MPASS(dvp->v_type == VDIR); VTOFUD(vp)->parent_nid = VTOI(dvp); } } void fuse_vnode_destroy(struct vnode *vp); +int fuse_vnode_alloc(struct mount *mp, + struct thread *td, + uint64_t nodeid, + enum vtype vtyp, + struct vnode **vpp); + int fuse_vnode_get(struct mount *mp, uint64_t nodeid, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum vtype vtyp); void fuse_vnode_open(struct vnode *vp, int32_t fuse_open_flags, struct thread *td); void fuse_vnode_refreshsize(struct vnode *vp, struct ucred *cred); int fuse_vnode_savesize(struct vnode *vp, struct ucred *cred); int fuse_vnode_setsize(struct vnode *vp, struct ucred *cred, off_t newsize); + +/* + * Since making a structure that is a file system specific "struct fid" + * is too big, due to alignment issues, this structure is copied into the + * fid_data field. fid_data0 in "struct fid" is used for the vnode type. + * Until MAXFIDSZ is increased, only the first half of this structure will + * fit in fid_data. This is fine for GlusterFS, since it always sets the + * "generation" to 0. As such, the rest is #ifdef notyet. + */ +struct fuse_fid_data { + uint64_t ffid_nid; + uint64_t ffid_parent_nid; +#ifdef notyet + uint64_t ffid_nid_gen; + uint64_t ffid_parent_nid_gen; +#endif +}; #endif /* _FUSE_NODE_H_ */ Index: projects/pnfs-server/sys/fs/fuse/fuse_vfsops.c =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_vfsops.c (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_vfsops.c (revision 297887) @@ -1,534 +1,572 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_param.h" #include "fuse_node.h" #include "fuse_ipc.h" #include "fuse_internal.h" #include #include #define FUSE_DEBUG_MODULE VFSOPS #include "fuse_debug.h" /* This will do for privilege types for now */ #ifndef PRIV_VFS_FUSE_ALLOWOTHER #define PRIV_VFS_FUSE_ALLOWOTHER PRIV_VFS_MOUNT_NONUSER #endif #ifndef PRIV_VFS_FUSE_MOUNT_NONUSER #define PRIV_VFS_FUSE_MOUNT_NONUSER PRIV_VFS_MOUNT_NONUSER #endif #ifndef PRIV_VFS_FUSE_SYNC_UNMOUNT #define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER #endif static vfs_mount_t fuse_vfsop_mount; static vfs_unmount_t fuse_vfsop_unmount; static vfs_root_t fuse_vfsop_root; static vfs_statfs_t fuse_vfsop_statfs; +static vfs_fhtovp_t fuse_vfsop_fhtovp; struct vfsops fuse_vfsops = { .vfs_mount = fuse_vfsop_mount, .vfs_unmount = fuse_vfsop_unmount, .vfs_root = fuse_vfsop_root, .vfs_statfs = fuse_vfsop_statfs, + .vfs_fhtovp = fuse_vfsop_fhtovp, }; SYSCTL_INT(_vfs_fuse, OID_AUTO, init_backgrounded, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 1, "indicate async handshake"); static int fuse_enforce_dev_perms = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, enforce_dev_perms, CTLFLAG_RW, &fuse_enforce_dev_perms, 0, "enforce fuse device permissions for secondary mounts"); static unsigned sync_unmount = 1; SYSCTL_UINT(_vfs_fuse, OID_AUTO, sync_unmount, CTLFLAG_RW, &sync_unmount, 0, "specify when to use synchronous unmount"); MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer"); static int fuse_getdevice(const char *fspec, struct thread *td, struct cdev **fdevp) { struct nameidata nd, *ndp = &nd; struct vnode *devvp; struct cdev *fdev; int err; /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fspec, td); if ((err = namei(ndp)) != 0) return err; NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (devvp->v_type != VCHR) { vrele(devvp); return ENXIO; } fdev = devvp->v_rdev; dev_ref(fdev); if (fuse_enforce_dev_perms) { /* * Check if mounter can open the fuse device. * * This has significance only if we are doing a secondary mount * which doesn't involve actually opening fuse devices, but we * still want to enforce the permissions of the device (in * order to keep control over the circle of fuse users). * * (In case of primary mounts, we are either the superuser so * we can do anything anyway, or we can mount only if the * device is already opened by us, ie. we are permitted to open * the device.) */ #if 0 #ifdef MAC err = mac_check_vnode_open(td->td_ucred, devvp, VREAD | VWRITE); if (!err) #endif #endif /* 0 */ err = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (err) { vrele(devvp); dev_rel(fdev); return err; } } /* * according to coda code, no extra lock is needed -- * although in sys/vnode.h this field is marked "v" */ vrele(devvp); if (!fdev->si_devsw || strcmp("fuse", fdev->si_devsw->d_name)) { dev_rel(fdev); return ENXIO; } *fdevp = fdev; return 0; } #define FUSE_FLAGOPT(fnam, fval) do { \ vfs_flagopt(opts, #fnam, &mntopts, fval); \ vfs_flagopt(opts, "__" #fnam, &__mntopts, fval); \ } while (0) static int fuse_vfsop_mount(struct mount *mp) { int err; uint64_t mntopts, __mntopts; int max_read_set; uint32_t max_read; int daemon_timeout; int fd; size_t len; struct cdev *fdev; struct fuse_data *data; struct thread *td; struct file *fp, *fptmp; char *fspec, *subtype; struct vfsoptlist *opts; cap_rights_t rights; subtype = NULL; max_read_set = 0; max_read = ~0; err = 0; mntopts = 0; __mntopts = 0; td = curthread; fuse_trace_printf_vfsop(); + /* + * Allow MNT_UPDATE only so the mountd can set exports on the file + * system. + */ if (mp->mnt_flag & MNT_UPDATE) - return EOPNOTSUPP; + return (0); MNT_ILOCK(mp); mp->mnt_flag |= MNT_SYNCHRONOUS; mp->mnt_data = NULL; MNT_IUNLOCK(mp); /* Get the new options passed to mount */ opts = mp->mnt_optnew; if (!opts) return EINVAL; /* `fspath' contains the mount point (eg. /mnt/fuse/sshfs); REQUIRED */ if (!vfs_getopts(opts, "fspath", &err)) return err; /* `from' contains the device name (eg. /dev/fuse0); REQUIRED */ fspec = vfs_getopts(opts, "from", &err); if (!fspec) return err; /* `fd' contains the filedescriptor for this session; REQUIRED */ if (vfs_scanopt(opts, "fd", "%d", &fd) != 1) return EINVAL; err = fuse_getdevice(fspec, td, &fdev); if (err != 0) return err; /* * With the help of underscored options the mount program * can inform us from the flags it sets by default */ FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY); FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN); FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS); FUSE_FLAGOPT(no_attrcache, FSESS_NO_ATTRCACHE); FUSE_FLAGOPT(no_readahed, FSESS_NO_READAHEAD); FUSE_FLAGOPT(no_datacache, FSESS_NO_DATACACHE); FUSE_FLAGOPT(no_namecache, FSESS_NO_NAMECACHE); FUSE_FLAGOPT(no_mmap, FSESS_NO_MMAP); FUSE_FLAGOPT(brokenio, FSESS_BROKENIO); if (vfs_scanopt(opts, "max_read=", "%u", &max_read) == 1) max_read_set = 1; if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) { if (daemon_timeout < FUSE_MIN_DAEMON_TIMEOUT) daemon_timeout = FUSE_MIN_DAEMON_TIMEOUT; else if (daemon_timeout > FUSE_MAX_DAEMON_TIMEOUT) daemon_timeout = FUSE_MAX_DAEMON_TIMEOUT; } else { daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT; } subtype = vfs_getopts(opts, "subtype=", &err); FS_DEBUG2G("mntopts 0x%jx\n", (uintmax_t)mntopts); err = fget(td, fd, cap_rights_init(&rights, CAP_READ), &fp); if (err != 0) { FS_DEBUG("invalid or not opened device: data=%p\n", data); goto out; } fptmp = td->td_fpop; td->td_fpop = fp; err = devfs_get_cdevpriv((void **)&data); td->td_fpop = fptmp; fdrop(fp, td); FUSE_LOCK(); if (err != 0 || data == NULL || data->mp != NULL) { FS_DEBUG("invalid or not opened device: data=%p data.mp=%p\n", data, data != NULL ? data->mp : NULL); err = ENXIO; FUSE_UNLOCK(); goto out; } if (fdata_get_dead(data)) { FS_DEBUG("device is dead during mount: data=%p\n", data); err = ENOTCONN; FUSE_UNLOCK(); goto out; } /* Sanity + permission checks */ if (!data->daemoncred) panic("fuse daemon found, but identity unknown"); if (mntopts & FSESS_DAEMON_CAN_SPY) err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER); if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid) /* are we allowed to do the first mount? */ err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER); if (err) { FUSE_UNLOCK(); goto out; } data->ref++; data->mp = mp; data->dataflags |= mntopts; data->max_read = max_read; data->daemon_timeout = daemon_timeout; FUSE_UNLOCK(); vfs_getnewfsid(mp); MNT_ILOCK(mp); mp->mnt_data = data; mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_USES_BCACHE; MNT_IUNLOCK(mp); /* We need this here as this slot is used by getnewvnode() */ mp->mnt_stat.f_iosize = PAGE_SIZE; if (subtype) { strlcat(mp->mnt_stat.f_fstypename, ".", MFSNAMELEN); strlcat(mp->mnt_stat.f_fstypename, subtype, MFSNAMELEN); } copystr(fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &len); bzero(mp->mnt_stat.f_mntfromname + len, MNAMELEN - len); FS_DEBUG2G("mp %p: %s\n", mp, mp->mnt_stat.f_mntfromname); /* Now handshaking with daemon */ fuse_internal_send_init(data, td); out: if (err) { FUSE_LOCK(); if (data->mp == mp) { /* * Destroy device only if we acquired reference to * it */ FS_DEBUG("mount failed, destroy device: data=%p mp=%p" " err=%d\n", data, mp, err); data->mp = NULL; fdata_trydestroy(data); } FUSE_UNLOCK(); dev_rel(fdev); } return err; } static int fuse_vfsop_unmount(struct mount *mp, int mntflags) { int err = 0; int flags = 0; struct cdev *fdev; struct fuse_data *data; struct fuse_dispatcher fdi; struct thread *td = curthread; fuse_trace_printf_vfsop(); if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } data = fuse_get_mpdata(mp); if (!data) { panic("no private data for mount point?"); } /* There is 1 extra root vnode reference (mp->mnt_data). */ FUSE_LOCK(); if (data->vroot != NULL) { struct vnode *vroot = data->vroot; data->vroot = NULL; FUSE_UNLOCK(); vrele(vroot); } else FUSE_UNLOCK(); err = vflush(mp, 0, flags, td); if (err) { debug_printf("vflush failed"); return err; } if (fdata_get_dead(data)) { goto alreadydead; } fdisp_init(&fdi, 0); fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL); err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); fdata_set_dead(data); alreadydead: FUSE_LOCK(); data->mp = NULL; fdev = data->fdev; fdata_trydestroy(data); FUSE_UNLOCK(); MNT_ILOCK(mp); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); dev_rel(fdev); return 0; } static int fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp) { struct fuse_data *data = fuse_get_mpdata(mp); int err = 0; if (data->vroot != NULL) { err = vget(data->vroot, lkflags, curthread); if (err == 0) *vpp = data->vroot; } else { err = fuse_vnode_get(mp, FUSE_ROOT_ID, NULL, vpp, NULL, VDIR); if (err == 0) { FUSE_LOCK(); MPASS(data->vroot == NULL || data->vroot == *vpp); if (data->vroot == NULL) { FS_DEBUG("new root vnode\n"); data->vroot = *vpp; FUSE_UNLOCK(); vref(*vpp); } else if (data->vroot != *vpp) { FS_DEBUG("root vnode race\n"); FUSE_UNLOCK(); VOP_UNLOCK(*vpp, 0); vrele(*vpp); vrecycle(*vpp); *vpp = data->vroot; } else FUSE_UNLOCK(); } } return err; } static int fuse_vfsop_statfs(struct mount *mp, struct statfs *sbp) { struct fuse_dispatcher fdi; int err = 0; struct fuse_statfs_out *fsfo; struct fuse_data *data; FS_DEBUG2G("mp %p: %s\n", mp, mp->mnt_stat.f_mntfromname); data = fuse_get_mpdata(mp); if (!(data->dataflags & FSESS_INITED)) goto fake; fdisp_init(&fdi, 0); fdisp_make(&fdi, FUSE_STATFS, mp, FUSE_ROOT_ID, NULL, NULL); err = fdisp_wait_answ(&fdi); if (err) { fdisp_destroy(&fdi); if (err == ENOTCONN) { /* * We want to seem a legitimate fs even if the daemon * is stiff dead... (so that, eg., we can still do path * based unmounting after the daemon dies). */ goto fake; } return err; } fsfo = fdi.answ; sbp->f_blocks = fsfo->st.blocks; sbp->f_bfree = fsfo->st.bfree; sbp->f_bavail = fsfo->st.bavail; sbp->f_files = fsfo->st.files; sbp->f_ffree = fsfo->st.ffree; /* cast from uint64_t to int64_t */ sbp->f_namemax = fsfo->st.namelen; sbp->f_bsize = fsfo->st.frsize; /* cast from uint32_t to uint64_t */ FS_DEBUG("fuse_statfs_out -- blocks: %llu, bfree: %llu, bavail: %llu, " "fil es: %llu, ffree: %llu, bsize: %i, namelen: %i\n", (unsigned long long)fsfo->st.blocks, (unsigned long long)fsfo->st.bfree, (unsigned long long)fsfo->st.bavail, (unsigned long long)fsfo->st.files, (unsigned long long)fsfo->st.ffree, fsfo->st.bsize, fsfo->st.namelen); fdisp_destroy(&fdi); return 0; fake: sbp->f_blocks = 0; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 0; sbp->f_ffree = 0; sbp->f_namemax = 0; sbp->f_bsize = FUSE_DEFAULT_BLOCKSIZE; return 0; } + +/* + * Translate a file handle into a vnode. fid_data0 is the v_type and + * fid_data is "struct fuse_fid" memcpy()'d so that alignment doesn't + * make the structure too big. + * There is no "generation" field in the file handle. GlusterFS never + * sets it. I don't know if other fuse filesystems do set it, but there + * is no space for it in fhandle_t. + */ +static int +fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags, + struct vnode **vpp) +{ + struct fuse_fid_data ffid; + enum vtype vtyp; + int err; + + if (fhp->fid_len != offsetof(struct fid, fid_data) + + sizeof(struct fuse_fid_data)) + return (ESTALE); + vtyp = (enum vtype)fhp->fid_data0; + if (vtyp != VREG && vtyp != VDIR && vtyp != VLNK) + return (ESTALE); + memcpy(&ffid, fhp->fid_data, sizeof(ffid)); + err = fuse_vnode_alloc(mp, curthread, ffid.ffid_nid, vtyp, vpp); + if (err != 0) + return (ESTALE); + if (vtyp == VDIR) + VTOFUD(*vpp)->parent_nid = ffid.ffid_parent_nid; + return (0); +} + Index: projects/pnfs-server/sys/fs/fuse/fuse_vnops.c =================================================================== --- projects/pnfs-server/sys/fs/fuse/fuse_vnops.c (revision 297886) +++ projects/pnfs-server/sys/fs/fuse/fuse_vnops.c (revision 297887) @@ -1,1964 +1,2535 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_param.h" #include "fuse_io.h" #include #define FUSE_DEBUG_MODULE VNOPS #include "fuse_debug.h" /* vnode ops */ static vop_access_t fuse_vnop_access; static vop_close_t fuse_vnop_close; static vop_create_t fuse_vnop_create; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_inactive_t fuse_vnop_inactive; static vop_link_t fuse_vnop_link; static vop_lookup_t fuse_vnop_lookup; static vop_mkdir_t fuse_vnop_mkdir; static vop_mknod_t fuse_vnop_mknod; static vop_open_t fuse_vnop_open; static vop_read_t fuse_vnop_read; static vop_readdir_t fuse_vnop_readdir; static vop_readlink_t fuse_vnop_readlink; static vop_reclaim_t fuse_vnop_reclaim; static vop_remove_t fuse_vnop_remove; static vop_rename_t fuse_vnop_rename; static vop_rmdir_t fuse_vnop_rmdir; static vop_setattr_t fuse_vnop_setattr; static vop_strategy_t fuse_vnop_strategy; static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; static vop_putpages_t fuse_vnop_putpages; static vop_print_t fuse_vnop_print; +static vop_getextattr_t fuse_vnop_getextattr; +static vop_setextattr_t fuse_vnop_setextattr; +static vop_listextattr_t fuse_vnop_listextattr; +static vop_deleteextattr_t fuse_vnop_deleteextattr; +static vop_advlock_t fuse_vnop_advlock; +static vop_vptofh_t fuse_vnop_vptofh; struct vop_vector fuse_vnops = { .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, .vop_close = fuse_vnop_close, .vop_create = fuse_vnop_create, .vop_fsync = fuse_vnop_fsync, .vop_getattr = fuse_vnop_getattr, .vop_inactive = fuse_vnop_inactive, .vop_link = fuse_vnop_link, .vop_lookup = fuse_vnop_lookup, .vop_mkdir = fuse_vnop_mkdir, .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = vop_stdpathconf, .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, .vop_reclaim = fuse_vnop_reclaim, .vop_remove = fuse_vnop_remove, .vop_rename = fuse_vnop_rename, .vop_rmdir = fuse_vnop_rmdir, .vop_setattr = fuse_vnop_setattr, .vop_strategy = fuse_vnop_strategy, .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, .vop_putpages = fuse_vnop_putpages, .vop_print = fuse_vnop_print, + .vop_getextattr = fuse_vnop_getextattr, + .vop_setextattr = fuse_vnop_setextattr, + .vop_listextattr = fuse_vnop_listextattr, + .vop_deleteextattr = fuse_vnop_deleteextattr, + .vop_advlock = fuse_vnop_advlock, + .vop_vptofh = fuse_vnop_vptofh, }; static u_long fuse_lookup_cache_hits = 0; SYSCTL_ULONG(_vfs_fuse, OID_AUTO, lookup_cache_hits, CTLFLAG_RD, &fuse_lookup_cache_hits, 0, ""); static u_long fuse_lookup_cache_misses = 0; SYSCTL_ULONG(_vfs_fuse, OID_AUTO, lookup_cache_misses, CTLFLAG_RD, &fuse_lookup_cache_misses, 0, ""); int fuse_lookup_cache_enable = 1; SYSCTL_INT(_vfs_fuse, OID_AUTO, lookup_cache_enable, CTLFLAG_RW, &fuse_lookup_cache_enable, 0, ""); /* * XXX: This feature is highly experimental and can bring to instabilities, * needs revisiting before to be enabled by default. */ static int fuse_reclaim_revoked = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, reclaim_revoked, CTLFLAG_RW, &fuse_reclaim_revoked, 0, ""); int fuse_pbuf_freecnt = -1; +int fuse_force_directio = 0; + #define fuse_vm_page_lock(m) vm_page_lock((m)); #define fuse_vm_page_unlock(m) vm_page_unlock((m)); #define fuse_vm_page_lock_queues() ((void)0) #define fuse_vm_page_unlock_queues() ((void)0) /* struct vnop_access_args { struct vnode *a_vp; #if VOP_ACCESS_TAKES_ACCMODE_T accmode_t a_accmode; #else int a_mode; #endif struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; struct fuse_access_param facp; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { if (vnode_isvroot(vp)) { return 0; } return ENXIO; } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { if (priv_check_cred(cred, PRIV_VFS_ADMIN, 0) || (fuse_match_cred(data->daemoncred, cred) == 0)) { return 0; } } return EBADF; } if (vnode_islnk(vp)) { return 0; } bzero(&facp, sizeof(facp)); err = fuse_internal_access(vp, accmode, &facp, ap->a_td, ap->a_cred); FS_DEBUG2G("err=%d accmode=0x%x\n", err, accmode); return err; } /* struct vnop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; fufh_type_t fufh_type; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return 0; } if (vnode_isdir(vp)) { if (fuse_filehandle_valid(vp, FUFH_RDONLY)) { fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); } return 0; } if (fflag & IO_NDELAY) { return 0; } fufh_type = fuse_filehandle_xlate_from_fflags(fflag); if (!fuse_filehandle_valid(vp, fufh_type)) { int i; for (i = 0; i < FUFH_MAXTYPE; i++) if (fuse_filehandle_valid(vp, i)) break; if (i == FUFH_MAXTYPE) panic("FUSE: fufh type %d found to be invalid in close" " (fflag=0x%x)\n", fufh_type, fflag); } if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, cred); } return 0; } /* struct vnop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; struct fuse_open_in *foi; struct fuse_entry_out *feo; struct fuse_dispatcher fdi; struct fuse_dispatcher *fdip = &fdi; int err; struct mount *mp = vnode_mount(dvp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); uint64_t x_fh_id; uint32_t x_open_flags; fuse_trace_printf_vnop(); if (fuse_isdeadfs(dvp)) { return ENXIO; } bzero(&fdi, sizeof(fdi)); /* XXX: Will we ever want devices ? */ if ((vap->va_type != VREG)) { MPASS(vap->va_type != VFIFO); goto bringup; } debug_printf("parent nid = %ju, mode = %x\n", (uintmax_t)parentnid, mode); fdisp_init(fdip, sizeof(*foi) + cnp->cn_namelen + 1); if (!fsess_isimpl(mp, FUSE_CREATE)) { debug_printf("eh, daemon doesn't implement create?\n"); return (EINVAL); } fdisp_make(fdip, FUSE_CREATE, vnode_mount(dvp), parentnid, td, cred); foi = fdip->indata; foi->mode = mode; foi->flags = O_CREAT | O_RDWR; memcpy((char *)fdip->indata + sizeof(*foi), cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[sizeof(*foi) + cnp->cn_namelen] = '\0'; err = fdisp_wait_answ(fdip); if (err) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_CREATE); debug_printf("create: got err=%d from daemon\n", err); goto out; } bringup: feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, VREG))) { goto out; } err = fuse_vnode_get(mp, feo->nodeid, dvp, vpp, cnp, VREG); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; uint64_t fh_id = ((struct fuse_open_out *)(feo + 1))->fh; fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; fri->flags = OFLAGS(mode); fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); fuse_insert_message(fdip->tick); return err; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); fdip->answ = feo + 1; x_fh_id = ((struct fuse_open_out *)(feo + 1))->fh; x_open_flags = ((struct fuse_open_out *)(feo + 1))->open_flags; + if (fuse_force_directio > 0) + x_open_flags |= FOPEN_DIRECT_IO; fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, x_fh_id); fuse_vnode_open(*vpp, x_open_flags, td); cache_purge_negative(dvp); out: fdisp_destroy(fdip); return err; } /* * Our vnop_fsync roughly corresponds to the FUSE_FSYNC method. The Linux * version of FUSE also has a FUSE_FLUSH method. * * On Linux, fsync() synchronizes a file's complete in-core state with that * on disk. The call is not supposed to return until the system has completed * that action or until an error is detected. * * Linux also has an fdatasync() call that is similar to fsync() but is not * required to update the metadata such as access time and modification time. */ /* struct vnop_fsync_args { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct thread * a_td; }; */ static int fuse_vnop_fsync(struct vop_fsync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); int type, err = 0; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; if (!fsess_isimpl(vnode_mount(vp), (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) { goto out; } for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { fuse_internal_fsync(vp, td, NULL, fufh); } } out: return 0; } /* struct vnop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct fuse_vnode_data *fvdat = VTOFUD(vp); int err = 0; int dataflags; struct fuse_dispatcher fdi; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; /* Note that we are not bailing out on a dead file system just yet. */ if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); err = ENOTCONN; debug_printf("fuse_getattr b: returning ENOTCONN\n"); return err; } else { goto fake; } } fdisp_init(&fdi, 0); if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) { if ((err == ENOTCONN) && vnode_isvroot(vp)) { /* see comment at similar place in fuse_statfs() */ fdisp_destroy(&fdi); goto fake; } if (err == ENOENT) { fuse_internal_vnode_disappear(vp); } goto out; } cache_attrs(vp, (struct fuse_attr_out *)fdi.answ); if (vap != VTOVA(vp)) { memcpy(vap, VTOVA(vp), sizeof(*vap)); } if (vap->va_type != vnode_vtype(vp)) { fuse_internal_vnode_disappear(vp); err = ENOENT; goto out; } if ((fvdat->flag & FN_SIZECHANGE) != 0) vap->va_size = fvdat->filesize; if (vnode_isreg(vp) && (fvdat->flag & FN_SIZECHANGE) == 0) { /* * This is for those cases when the file size changed without us * knowing, and we want to catch up. */ off_t new_filesize = ((struct fuse_attr_out *) fdi.answ)->attr.size; if (fvdat->filesize != new_filesize) { fuse_vnode_setsize(vp, cred, new_filesize); } } debug_printf("fuse_getattr e: returning 0\n"); out: fdisp_destroy(&fdi); return err; fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); return 0; } /* struct vnop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh = NULL; int type, need_flush = 1; FS_DEBUG("inode=%ju\n", (uintmax_t)VTOI(vp)); - for (type = 0; type < FUFH_MAXTYPE; type++) { - fufh = &(fvdat->fufh[type]); - if (FUFH_IS_VALID(fufh)) { - if (need_flush && vp->v_type == VREG) { - if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { - fuse_vnode_savesize(vp, NULL); + /* + * For NFS exported mounts, delay buffer cache flushing and the + * close until the vnode is reclaimed. This is done since NFS + * does the open implicitly when a read/write occurs and then + * it reduces the vnode reference count to 0 for each read/write. + */ + if ((vp->v_mount->mnt_flag & MNT_EXPORTED) == 0) { + for (type = 0; type < FUFH_MAXTYPE; type++) { + fufh = &(fvdat->fufh[type]); + if (FUFH_IS_VALID(fufh)) { + if (need_flush && vp->v_type == VREG) { + if ((VTOFUD(vp)->flag & FN_SIZECHANGE) + != 0) { + fuse_vnode_savesize(vp, NULL); + } + if (fuse_data_cache_invalidate || + (fvdat->flag & FN_REVOKED) != 0) + fuse_io_invalbuf(vp, td); + else + fuse_io_flushbuf(vp, MNT_WAIT, + td); + need_flush = 0; } - if (fuse_data_cache_invalidate || - (fvdat->flag & FN_REVOKED) != 0) - fuse_io_invalbuf(vp, td); - else - fuse_io_flushbuf(vp, MNT_WAIT, td); - need_flush = 0; + fuse_filehandle_close(vp, type, td, NULL); } - fuse_filehandle_close(vp, type, td, NULL); } } +#ifdef notnow if ((fvdat->flag & FN_REVOKED) != 0 && fuse_reclaim_revoked) { vrecycle(vp); } +#endif return 0; } /* struct vnop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = VTOVA(vp); struct fuse_dispatcher fdi; struct fuse_entry_out *feo; struct fuse_link_in fli; int err; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_mount(tdvp) != vnode_mount(vp)) { return EXDEV; } if (vap->va_nlink >= FUSE_LINK_MAX) { return EMLINK; } fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp, FUSE_LINK, &fli, sizeof(fli), &fdi); if ((err = fdisp_wait_answ(&fdi))) { goto out; } feo = fdi.answ; err = fuse_internal_checkentry(feo, vnode_vtype(vp)); out: fdisp_destroy(&fdi); return err; } /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; */ int fuse_vnop_lookup(struct vop_lookup_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; int nameiop = cnp->cn_nameiop; int flags = cnp->cn_flags; int wantparent = flags & (LOCKPARENT | WANTPARENT); int islastcn = flags & ISLASTCN; struct mount *mp = vnode_mount(dvp); int err = 0; int lookup_err = 0; struct vnode *vp = NULL; struct fuse_dispatcher fdi; enum fuse_opcode op; uint64_t nid; struct fuse_access_param facp; FS_DEBUG2G("parent_inode=%ju - %*s\n", (uintmax_t)VTOI(dvp), (int)cnp->cn_namelen, cnp->cn_nameptr); if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } if (!vnode_isdir(dvp)) { return ENOTDIR; } if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) { return EROFS; } /* * We do access check prior to doing anything else only in the case * when we are at fs root (we'd like to say, "we are at the first * component", but that's not exactly the same... nevermind). * See further comments at further access checks. */ bzero(&facp, sizeof(facp)); if (vnode_isvroot(dvp)) { /* early permission check hack */ if ((err = fuse_internal_access(dvp, VEXEC, &facp, td, cred))) { return err; } } if (flags & ISDOTDOT) { nid = VTOFUD(dvp)->parent_nid; if (nid == 0) { return ENOENT; } fdisp_init(&fdi, 0); op = FUSE_GETATTR; goto calldaemon; } else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') { nid = VTOI(dvp); fdisp_init(&fdi, 0); op = FUSE_GETATTR; goto calldaemon; } else if (fuse_lookup_cache_enable) { err = cache_lookup(dvp, vpp, cnp, NULL, NULL); switch (err) { case -1: /* positive match */ atomic_add_acq_long(&fuse_lookup_cache_hits, 1); return 0; case 0: /* no match in cache */ atomic_add_acq_long(&fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ /* fall through */ default: return err; } } nid = VTOI(dvp); fdisp_init(&fdi, cnp->cn_namelen + 1); op = FUSE_LOOKUP; calldaemon: fdisp_make(&fdi, op, mp, nid, td, cred); if (op == FUSE_LOOKUP) { memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; } lookup_err = fdisp_wait_answ(&fdi); if ((op == FUSE_LOOKUP) && !lookup_err) { /* lookup call succeeded */ nid = ((struct fuse_entry_out *)fdi.answ)->nodeid; if (!nid) { /* * zero nodeid is the same as "not found", * but it's also cacheable (which we keep * keep on doing not as of writing this) */ lookup_err = ENOENT; } else if (nid == FUSE_ROOT_ID) { lookup_err = EINVAL; } } if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT || op != FUSE_LOOKUP)) { fdisp_destroy(&fdi); return lookup_err; } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { if ((nameiop == CREATE || nameiop == RENAME) && islastcn /* && directory dvp has not been removed */ ) { if (vfs_isrdonly(mp)) { err = EROFS; goto out; } #if 0 /* THINK_ABOUT_THIS */ if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { goto out; } #endif /* * Possibly record the position of a slot in the * directory large enough for the new component name. * This can be recorded in the vnode private data for * dvp. Set the SAVENAME flag to hold onto the * pathname for use later in VOP_CREATE or VOP_RENAME. */ cnp->cn_flags |= SAVENAME; err = EJUSTRETURN; goto out; } /* Consider inserting name into cache. */ /* * No we can't use negative caching, as the fs * changes are out of our control. * False positives' falseness turns out just as things * go by, but false negatives' falseness doesn't. * (and aiding the caching mechanism with extra control * mechanisms comes quite close to beating the whole purpose * caching...) */ #if 0 if ((cnp->cn_flags & MAKEENTRY) != 0) { FS_DEBUG("inserting NULL into cache\n"); cache_enter(dvp, NULL, cnp); } #endif err = ENOENT; goto out; } else { /* !lookup_err */ struct fuse_entry_out *feo = NULL; struct fuse_attr *fattr = NULL; if (op == FUSE_GETATTR) { fattr = &((struct fuse_attr_out *)fdi.answ)->attr; } else { feo = (struct fuse_entry_out *)fdi.answ; fattr = &(feo->attr); } /* * If deleting, and at end of pathname, return parameters * which can be used to remove file. If the wantparent flag * isn't set, we return only the directory, otherwise we go on * and lock the inode, being careful with ".". */ if (nameiop == DELETE && islastcn) { /* * Check for write access on directory. */ facp.xuid = fattr->uid; facp.facc_flags |= FACCESS_STICKY; err = fuse_internal_access(dvp, VWRITE, &facp, td, cred); facp.facc_flags &= ~FACCESS_XQUERIES; if (err) { goto out; } if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { err = fuse_vnode_get(dvp->v_mount, nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) goto out; *vpp = vp; } /* * Save the name for use in VOP_RMDIR and VOP_REMOVE * later. */ cnp->cn_flags |= SAVENAME; goto out; } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && wantparent && islastcn) { #if 0 /* THINK_ABOUT_THIS */ if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { goto out; } #endif /* * Check for "." */ if (nid == VTOI(dvp)) { err = EISDIR; goto out; } err = fuse_vnode_get(vnode_mount(dvp), nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) { goto out; } *vpp = vp; /* * Save the name for use in VOP_RENAME later. */ cnp->cn_flags |= SAVENAME; goto out; } if (flags & ISDOTDOT) { struct mount *mp; int ltype; /* * Expanded copy of vn_vget_ino() so that * fuse_vnode_get() can be used. */ mp = dvp->v_mount; ltype = VOP_ISLOCKED(dvp); err = vfs_busy(mp, MBF_NOWAIT); if (err != 0) { vfs_ref(mp); VOP_UNLOCK(dvp, 0); err = vfs_busy(mp, 0); vn_lock(dvp, ltype | LK_RETRY); vfs_rel(mp); if (err) goto out; if ((dvp->v_iflag & VI_DOOMED) != 0) { err = ENOENT; vfs_unbusy(mp); goto out; } } VOP_UNLOCK(dvp, 0); err = fuse_vnode_get(vnode_mount(dvp), nid, NULL, &vp, cnp, IFTOVT(fattr->mode)); vfs_unbusy(mp); vn_lock(dvp, ltype | LK_RETRY); if ((dvp->v_iflag & VI_DOOMED) != 0) { if (err == 0) vput(vp); err = ENOENT; } if (err) goto out; *vpp = vp; } else if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { err = fuse_vnode_get(vnode_mount(dvp), nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) { goto out; } fuse_vnode_setparent(vp, dvp); *vpp = vp; } if (op == FUSE_GETATTR) { cache_attrs(*vpp, (struct fuse_attr_out *)fdi.answ); } else { cache_attrs(*vpp, (struct fuse_entry_out *)fdi.answ); } /* Insert name into cache if appropriate. */ /* * Nooo, caching is evil. With caching, we can't avoid stale * information taking over the playground (cached info is not * just positive/negative, it does have qualitative aspects, * too). And a (VOP/FUSE)_GETATTR is always thrown anyway, when * walking down along cached path components, and that's not * any cheaper than FUSE_LOOKUP. This might change with * implementing kernel side attr caching, but... In Linux, * lookup results are not cached, and the daemon is bombarded * with FUSE_LOOKUPS on and on. This shows that by design, the * daemon is expected to handle frequent lookup queries * efficiently, do its caching in userspace, and so on. * * So just leave the name cache alone. */ /* * Well, now I know, Linux caches lookups, but with a * timeout... So it's the same thing as attribute caching: * we can deal with it when implement timeouts. */ #if 0 if (cnp->cn_flags & MAKEENTRY) { cache_enter(dvp, *vpp, cnp); } #endif } out: if (!lookup_err) { /* No lookup error; need to clean up. */ if (err) { /* Found inode; exit with no vnode. */ if (op == FUSE_LOOKUP) { fuse_internal_forget_send(vnode_mount(dvp), td, cred, nid, 1); } fdisp_destroy(&fdi); return err; } else { #ifndef NO_EARLY_PERM_CHECK_HACK if (!islastcn) { /* * We have the attributes of the next item * *now*, and it's a fact, and we do not * have to do extra work for it (ie, beg the * daemon), and it neither depends on such * accidental things like attr caching. So * the big idea: check credentials *now*, * not at the beginning of the next call to * lookup. * * The first item of the lookup chain (fs root) * won't be checked then here, of course, as * its never "the next". But go and see that * the root is taken care about at the very * beginning of this function. * * Now, given we want to do the access check * this way, one might ask: so then why not * do the access check just after fetching * the inode and its attributes from the * daemon? Why bother with producing the * corresponding vnode at all if something * is not OK? We know what's the deal as * soon as we get those attrs... There is * one bit of info though not given us by * the daemon: whether his response is * authorative or not... His response should * be ignored if something is mounted over * the dir in question. But that can be * known only by having the vnode... */ int tmpvtype = vnode_vtype(*vpp); bzero(&facp, sizeof(facp)); /*the early perm check hack */ facp.facc_flags |= FACCESS_VA_VALID; if ((tmpvtype != VDIR) && (tmpvtype != VLNK)) { err = ENOTDIR; } if (!err && !vnode_mountedhere(*vpp)) { err = fuse_internal_access(*vpp, VEXEC, &facp, td, cred); } if (err) { if (tmpvtype == VLNK) FS_DEBUG("weird, permission error with a symlink?\n"); vput(*vpp); *vpp = NULL; } } #endif } } fdisp_destroy(&fdi); return err; } /* struct vnop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct fuse_mkdir_in fmdi; fuse_trace_printf_vnop(); if (fuse_isdeadfs(dvp)) { return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); } /* struct vnop_mknod_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mknod(struct vop_mknod_args *ap) { return (EINVAL); } /* struct vnop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; / struct file *a_fp; }; */ static int fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; int mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; fufh_type_t fufh_type; struct fuse_vnode_data *fvdat; int error, isdir = 0; + int32_t fuse_open_flags; FS_DEBUG2G("inode=%ju mode=0x%x\n", (uintmax_t)VTOI(vp), mode); if (fuse_isdeadfs(vp)) { return ENXIO; } fvdat = VTOFUD(vp); if (vnode_isdir(vp)) { isdir = 1; } + fuse_open_flags = 0; if (isdir) { fufh_type = FUFH_RDONLY; } else { fufh_type = fuse_filehandle_xlate_from_fflags(mode); + if (fufh_type == FUFH_WRONLY || + (fvdat->flag & FN_DIRECTIO) != 0 || + fuse_force_directio > 0) + fuse_open_flags = FOPEN_DIRECT_IO; } if (fuse_filehandle_valid(vp, fufh_type)) { - fuse_vnode_open(vp, 0, td); + fuse_vnode_open(vp, fuse_open_flags, td); return 0; } error = fuse_filehandle_open(vp, fufh_type, NULL, td, cred); return error; } /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; - struct ucred *cred = ap->a_cred; + struct ucred *cred = NULL; + int err, freecred; + gid_t gid; FS_DEBUG2G("inode=%ju offset=%jd resid=%zd\n", (uintmax_t)VTOI(vp), uio->uio_offset, uio->uio_resid); - if (fuse_isdeadfs(vp)) { - return ENXIO; - } + if (fuse_isdeadfs(vp)) + return (ENXIO); - if (VTOFUD(vp)->flag & FN_DIRECTIO) { + if ((VTOFUD(vp)->flag & FN_DIRECTIO) != 0) ioflag |= IO_DIRECT; - } - return fuse_io_dispatch(vp, uio, ioflag, cred); + err = 0; + freecred = 0; + if ((vnode_mount(vp)->mnt_flag & MNT_EXPORTED) != 0 && + fuse_filehandle_validrw(vp, FUFH_RDONLY) == FUFH_INVALID) { + FS_DEBUG("doing open() before read"); + /* + * This should only happen when VOP_READ() is done by an + * nfsd thread. Since the nfsd thread has already done + * permission checks, I believe it is safe to open the + * file as root. + */ + cred = crget(); + cred->cr_uid = cred->cr_ruid = cred->cr_svuid = 0; + gid = 0; + crsetgroups(cred, 1, &gid); + cred->cr_rgid = cred->cr_svgid = cred->cr_groups[0]; + cred->cr_prison = &prison0; + prison_hold(cred->cr_prison); + freecred = 1; + err = fuse_filehandle_open(vp, FUFH_RDONLY, NULL, curthread, + cred); + } else + cred = ap->a_cred; + + if (err == 0) + err = fuse_io_dispatch(vp, uio, ioflag, cred); + +#ifdef notdef + if (freefufh != 0) + fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); +#endif + if (freecred != 0) + crfree(cred); + return (err); } /* struct vnop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; - int *ncookies; + int *a_ncookies; u_long **a_cookies; }; */ static int fuse_vnop_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_filehandle *fufh = NULL; struct fuse_vnode_data *fvdat; struct fuse_iov cookediov; int err = 0; - int freefufh = 0; + int ncookies; + u_long *cookies; + off_t startoff; + ssize_t tresid; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); + *ap->a_eofflag = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } if ( /* XXXIP ((uio_iovcnt(uio) > 1)) || */ (uio_resid(uio) < sizeof(struct dirent))) { return EINVAL; } fvdat = VTOFUD(vp); + tresid = uio->uio_resid; + startoff = uio->uio_offset; if (!fuse_filehandle_valid(vp, FUFH_RDONLY)) { FS_DEBUG("calling readdir() before open()"); err = fuse_filehandle_open(vp, FUFH_RDONLY, &fufh, NULL, cred); - freefufh = 1; + if (err == 0) { + /* + * When a directory is opened, it must be read from + * the beginning. Hopefully, the "startoff" still + * exists as an offset cookie for the directory. + * If not, it will read the entire directory without + * returning any entries and just return eof. + */ + uio->uio_offset = 0; + } } else { err = fuse_filehandle_get(vp, FUFH_RDONLY, &fufh); } if (err) { return (err); } + + if (ap->a_ncookies != NULL) { + ncookies = uio->uio_resid; + ncookies = ncookies / (offsetof(struct dirent, d_name) + 4) + 1; + cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); + *ap->a_ncookies = ncookies; + *ap->a_cookies = cookies; + } else { + ncookies = 0; + cookies = NULL; + } #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1) fiov_init(&cookediov, DIRCOOKEDSIZE); - err = fuse_internal_readdir(vp, uio, fufh, &cookediov); + err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov, + &ncookies, cookies); fiov_teardown(&cookediov); +#ifdef notdef if (freefufh) { fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); } +#endif + if (ap->a_ncookies != NULL) { + if (err == 0) { + *ap->a_ncookies -= ncookies; + } else { + free(*ap->a_cookies, M_TEMP); + *ap->a_ncookies = 0; + *ap->a_cookies = NULL; + } + } + if (err == 0 && tresid == uio->uio_resid) + *ap->a_eofflag = 1; return err; } /* struct vnop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; */ static int fuse_vnop_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_dispatcher fdi; int err; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } if (!vnode_islnk(vp)) { return EINVAL; } fdisp_init(&fdi, 0); err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred); if (err) { goto out; } if (((char *)fdi.answ)[0] == '/' && fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) { char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname; err = uiomove(mpth, strlen(mpth), uio); } if (!err) { err = uiomove(fdi.answ, fdi.iosize, uio); } out: fdisp_destroy(&fdi); return err; } /* struct vnop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh = NULL; int type; if (!fvdat) { panic("FUSE: no vnode data during recycling"); } FS_DEBUG("inode=%ju\n", (uintmax_t)VTOI(vp)); for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { - printf("FUSE: vnode being reclaimed but fufh (type=%d) is valid", + FS_DEBUG("FUSE: vnode being reclaimed but fufh (type=%d) is valid", type); fuse_filehandle_close(vp, type, td, NULL); } } if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) { fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp), fvdat->nlookup); } fuse_vnode_setparent(vp, NULL); cache_purge(vp); vfs_hash_remove(vp); vnode_destroy_vobject(vp); fuse_vnode_destroy(vp); return 0; } /* struct vnop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; int err; FS_DEBUG2G("inode=%ju name=%*s\n", (uintmax_t)VTOI(vp), (int)cnp->cn_namelen, cnp->cn_nameptr); if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_isdir(vp)) { return EPERM; } cache_purge(vp); err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); if (err == 0) fuse_internal_vnode_disappear(vp); return err; } /* struct vnop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; */ static int fuse_vnop_rename(struct vop_rename_args *ap) { struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct componentname *fcnp = ap->a_fcnp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct fuse_data *data; int err = 0; FS_DEBUG2G("from: inode=%ju name=%*s -> to: inode=%ju name=%*s\n", (uintmax_t)VTOI(fvp), (int)fcnp->cn_namelen, fcnp->cn_nameptr, (uintmax_t)(tvp == NULL ? -1 : VTOI(tvp)), (int)tcnp->cn_namelen, tcnp->cn_nameptr); if (fuse_isdeadfs(fdvp)) { return ENXIO; } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { FS_DEBUG("cross-device rename: %s -> %s\n", fcnp->cn_nameptr, (tcnp != NULL ? tcnp->cn_nameptr : "(NULL)")); err = EXDEV; goto out; } cache_purge(fvp); /* * FUSE library is expected to check if target directory is not * under the source directory in the file system tree. * Linux performs this check at VFS level. */ data = fuse_get_mpdata(vnode_mount(tdvp)); sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); if (err == 0) { if (tdvp != fdvp) fuse_vnode_setparent(fvp, tdvp); if (tvp != NULL) fuse_vnode_setparent(tvp, NULL); } sx_unlock(&data->rename_lock); if (tvp != NULL && tvp != fvp) { cache_purge(tvp); } if (vnode_isdir(fvp)) { if ((tvp != NULL) && vnode_isdir(tvp)) { cache_purge(tdvp); } cache_purge(fdvp); } out: if (tdvp == tvp) { vrele(tdvp); } else { vput(tdvp); } if (tvp != NULL) { vput(tvp); } vrele(fdvp); vrele(fvp); return err; } /* struct vnop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } *ap; */ static int fuse_vnop_rmdir(struct vop_rmdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; int err; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp) == VTOFUD(dvp)) { return EINVAL; } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); if (err == 0) fuse_internal_vnode_disappear(vp); return err; } /* struct vnop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct fuse_dispatcher fdi; struct fuse_setattr_in *fsai; struct fuse_access_param facp; int err = 0; enum vtype vtyp; int sizechanged = 0; uint64_t newsize = 0; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } fdisp_init(&fdi, sizeof(*fsai)); fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); fsai = fdi.indata; fsai->valid = 0; bzero(&facp, sizeof(facp)); facp.xuid = vap->va_uid; facp.xgid = vap->va_gid; if (vap->va_uid != (uid_t)VNOVAL) { facp.facc_flags |= FACCESS_CHOWN; fsai->uid = vap->va_uid; fsai->valid |= FATTR_UID; } if (vap->va_gid != (gid_t)VNOVAL) { facp.facc_flags |= FACCESS_CHOWN; fsai->gid = vap->va_gid; fsai->valid |= FATTR_GID; } if (vap->va_size != VNOVAL) { struct fuse_filehandle *fufh = NULL; /*Truncate to a new value. */ fsai->size = vap->va_size; sizechanged = 1; newsize = vap->va_size; fsai->valid |= FATTR_SIZE; fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); if (fufh) { fsai->fh = fufh->fh_id; fsai->valid |= FATTR_FH; } } if (vap->va_atime.tv_sec != VNOVAL) { fsai->atime = vap->va_atime.tv_sec; fsai->atimensec = vap->va_atime.tv_nsec; fsai->valid |= FATTR_ATIME; } if (vap->va_mtime.tv_sec != VNOVAL) { fsai->mtime = vap->va_mtime.tv_sec; fsai->mtimensec = vap->va_mtime.tv_nsec; fsai->valid |= FATTR_MTIME; } if (vap->va_mode != (mode_t)VNOVAL) { fsai->mode = vap->va_mode & ALLPERMS; fsai->valid |= FATTR_MODE; } if (!fsai->valid) { goto out; } vtyp = vnode_vtype(vp); if (fsai->valid & FATTR_SIZE && vtyp == VDIR) { err = EISDIR; goto out; } if (vfs_isrdonly(vnode_mount(vp)) && (fsai->valid & ~FATTR_SIZE || vtyp == VREG)) { err = EROFS; goto out; } if (fsai->valid & ~FATTR_SIZE) { /*err = fuse_internal_access(vp, VADMIN, context, &facp); */ /*XXX */ err = 0; } facp.facc_flags &= ~FACCESS_XQUERIES; if (err && !(fsai->valid & ~(FATTR_ATIME | FATTR_MTIME)) && vap->va_vaflags & VA_UTIMES_NULL) { err = fuse_internal_access(vp, VWRITE, &facp, td, cred); } if (err) goto out; if ((err = fdisp_wait_answ(&fdi))) goto out; vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode); if (vnode_vtype(vp) != vtyp) { if (vnode_vtype(vp) == VNON && vtyp != VNON) { debug_printf("FUSE: Dang! vnode_vtype is VNON and vtype isn't.\n"); } else { /* * STALE vnode, ditch * * The vnode has changed its type "behind our back". There's * nothing really we can do, so let us just force an internal * revocation and tell the caller to try again, if interested. */ fuse_internal_vnode_disappear(vp); err = EAGAIN; } } if (!err && !sizechanged) { cache_attrs(vp, (struct fuse_attr_out *)fdi.answ); } out: fdisp_destroy(&fdi); if (!err && sizechanged) { fuse_vnode_setsize(vp, cred, newsize); VTOFUD(vp)->flag &= ~FN_SIZECHANGE; } return err; } /* struct vnop_strategy_args { struct vnode *a_vp; struct buf *a_bp; }; */ static int fuse_vnop_strategy(struct vop_strategy_args *ap) { struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; fuse_trace_printf_vnop(); if (!vp || fuse_isdeadfs(vp)) { bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); return ENXIO; } if (bp->b_iocmd == BIO_WRITE) fuse_vnode_refreshsize(vp, NOCRED); (void)fuse_io_strategy(vp, bp); /* * This is a dangerous function. If returns error, that might mean a * panic. We prefer pretty much anything over being forced to panic * by a malicious daemon (a demon?). So we just return 0 anyway. You * should never mind this: this function has its own error * propagation mechanism via the argument buffer, so * not-that-melodramatic residents of the call chain still will be * able to know what to do. */ return 0; } /* struct vnop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; */ static int fuse_vnop_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; char *target = ap->a_target; struct fuse_dispatcher fdi; int err; size_t len; FS_DEBUG2G("inode=%ju name=%*s\n", (uintmax_t)VTOI(dvp), (int)cnp->cn_namelen, cnp->cn_nameptr); if (fuse_isdeadfs(dvp)) { return ENXIO; } /* * Unlike the other creator type calls, here we have to create a message * where the name of the new entry comes first, and the data describing * the entry comes second. * Hence we can't rely on our handy fuse_internal_newentry() routine, * but put together the message manually and just call the core part. */ len = strlen(target) + 1; fdisp_init(&fdi, len + cnp->cn_namelen + 1); fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len); err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi); fdisp_destroy(&fdi); return err; } /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_write(struct vop_write_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; - struct ucred *cred = ap->a_cred; + struct ucred *cred = NULL; + int err, freecred; + gid_t gid; fuse_trace_printf_vnop(); - if (fuse_isdeadfs(vp)) { - return ENXIO; - } - fuse_vnode_refreshsize(vp, cred); + if (fuse_isdeadfs(vp)) + return (ENXIO); + fuse_vnode_refreshsize(vp, ap->a_cred); - if (VTOFUD(vp)->flag & FN_DIRECTIO) { + if ((VTOFUD(vp)->flag & FN_DIRECTIO) != 0) ioflag |= IO_DIRECT; - } - return fuse_io_dispatch(vp, uio, ioflag, cred); + err = 0; + freecred = 0; + if ((vnode_mount(vp)->mnt_flag & MNT_EXPORTED) != 0 && + !fuse_filehandle_valid(vp, FUFH_RDWR)) { + FS_DEBUG("doing open() before write"); + /* + * This should only happen when VOP_WRITE() is done by an + * nfsd thread. Since the nfsd thread has already done + * permission checks, I believe it is safe to open the + * file as root. + * The open is done FUFH_RDWR and not FUFH_WRONLY because + * a write of a partial buffer cache block will require + * that the block be read in first. + */ + cred = crget(); + cred->cr_uid = cred->cr_ruid = cred->cr_svuid = 0; + gid = 0; + crsetgroups(cred, 1, &gid); + cred->cr_rgid = cred->cr_svgid = cred->cr_groups[0]; + cred->cr_prison = &prison0; + prison_hold(cred->cr_prison); + freecred = 1; + err = fuse_filehandle_open(vp, FUFH_RDWR, NULL, curthread, + cred); + } else + cred = ap->a_cred; + + if (err == 0) + err = fuse_io_dispatch(vp, uio, ioflag, cred); + +#ifdef notdef + if (freefufh != 0) + fuse_filehandle_close(vp, FUFH_RDWR, NULL, cred); +#endif + if (freecred != 0) + crfree(cred); + return (err); } /* struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; }; */ static int fuse_vnop_getpages(struct vop_getpages_args *ap) { int i, error, nextoff, size, toff, count, npages; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; vm_page_t *pages; FS_DEBUG2G("heh\n"); vp = ap->a_vp; KASSERT(vp->v_object, ("objectless vp passed to getpages")); td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ pages = ap->a_m; - npages = ap->a_count; + count = ap->a_count; if (!fsess_opt_mmap(vnode_mount(vp))) { FS_DEBUG("called on non-cacheable vnode??\n"); return (VM_PAGER_ERROR); } + npages = btoc(count); /* - * If the last page is partially valid, just return it and allow - * the pager to zero-out the blanks. Partially valid pages can - * only occur at the file EOF. - * - * XXXGL: is that true for FUSE, which is a local filesystem, - * but still somewhat disconnected from the kernel? + * If the requested page is partially valid, just return it and + * allow the pager to zero-out the blanks. Partially valid pages + * can only occur at the file EOF. */ + VM_OBJECT_WLOCK(vp->v_object); - if (pages[npages - 1]->valid != 0 && --npages == 0) - goto out; + fuse_vm_page_lock_queues(); + if (pages[ap->a_reqpage]->valid != 0) { + for (i = 0; i < npages; ++i) { + if (i != ap->a_reqpage) { + fuse_vm_page_lock(pages[i]); + vm_page_free(pages[i]); + fuse_vm_page_unlock(pages[i]); + } + } + fuse_vm_page_unlock_queues(); + VM_OBJECT_WUNLOCK(vp->v_object); + return 0; + } + fuse_vm_page_unlock_queues(); VM_OBJECT_WUNLOCK(vp->v_object); /* * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ bp = getpbuf(&fuse_pbuf_freecnt); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, npages); - count = npages << PAGE_SHIFT; iov.iov_base = (caddr_t)kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); relpbuf(bp, &fuse_pbuf_freecnt); if (error && (uio.uio_resid == count)) { FS_DEBUG("error %d\n", error); + VM_OBJECT_WLOCK(vp->v_object); + fuse_vm_page_lock_queues(); + for (i = 0; i < npages; ++i) { + if (i != ap->a_reqpage) { + fuse_vm_page_lock(pages[i]); + vm_page_free(pages[i]); + fuse_vm_page_unlock(pages[i]); + } + } + fuse_vm_page_unlock_queues(); + VM_OBJECT_WUNLOCK(vp->v_object); return VM_PAGER_ERROR; } /* * Calculate the number of bytes read and validate only that number * of bytes. Note that due to pending writes, size may be 0. This * does not mean that the remaining data is invalid! */ size = count - uio.uio_resid; VM_OBJECT_WLOCK(vp->v_object); fuse_vm_page_lock_queues(); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; KASSERT(m->dirty == 0, ("fuse_getpages: page %p is dirty", m)); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_valid_range(m, 0, size - toff); KASSERT(m->dirty == 0, ("fuse_getpages: page %p is dirty", m)); } else { /* * Read operation was short. If no error occured * we may have hit a zero-fill section. We simply * leave valid set to 0. */ ; } + if (i != ap->a_reqpage) + vm_page_readahead_finish(m); } fuse_vm_page_unlock_queues(); -out: VM_OBJECT_WUNLOCK(vp->v_object); - if (ap->a_rbehind) - *ap->a_rbehind = 0; - if (ap->a_rahead) - *ap->a_rahead = 0; - return (VM_PAGER_OK); + return 0; } /* struct vnop_putpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; }; */ static int fuse_vnop_putpages(struct vop_putpages_args *ap) { struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; int i, error, npages, count; off_t offset; int *rtvals; struct vnode *vp; struct thread *td; struct ucred *cred; vm_page_t *pages; vm_ooffset_t fsize; FS_DEBUG2G("heh\n"); vp = ap->a_vp; KASSERT(vp->v_object, ("objectless vp passed to putpages")); fsize = vp->v_object->un_pager.vnp.vnp_size; td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); offset = IDX_TO_OFF(pages[0]->pindex); if (!fsess_opt_mmap(vnode_mount(vp))) { FS_DEBUG("called on non-cacheable vnode??\n"); } for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_AGAIN; /* * When putting pages, do not extend file past EOF. */ if (offset + count > fsize) { count = fsize - offset; if (count < 0) count = 0; } /* * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ bp = getpbuf(&fuse_pbuf_freecnt); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); PCPU_INC(cnt.v_vnodeout); PCPU_ADD(cnt.v_vnodepgsout, count); iov.iov_base = (caddr_t)kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); relpbuf(bp, &fuse_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; for (i = 0; i < nwritten; i++) { rtvals[i] = VM_PAGER_OK; VM_OBJECT_WLOCK(pages[i]->object); vm_page_undirty(pages[i]); VM_OBJECT_WUNLOCK(pages[i]->object); } } return rtvals[0]; } /* struct vnop_print_args { struct vnode *a_vp; }; */ static int fuse_vnop_print(struct vop_print_args *ap) { struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp); printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n", (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid, (uintmax_t)fvdat->nlookup, fvdat->flag); return 0; } + +/* + struct vop_getextattr_args { + struct vnode *a_vp; + int a_attrnamespace; + const char *a_name; + struct uio *a_uio; + size_t *a_size; + struct ucred *a_cred; + struct thread *a_td; + }; +*/ +static int +fuse_vnop_getextattr(struct vop_getextattr_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + struct fuse_dispatcher fdi; + struct fuse_getxattr_in *fgin; + struct fuse_getxattr_out *fgout; + int err = 0; + size_t len; + + FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); + + /* Fuse only supports one attribute name space, so make it "user". */ + if (ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) + return (ENOATTR); + if (fuse_isdeadfs(vp) != 0) + return (ENXIO); + len = strlen(ap->a_name); + fdisp_init(&fdi, sizeof(struct fuse_getxattr_in) + len + 1); + fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, curthread, ap->a_cred); + fgin = (struct fuse_getxattr_in *)fdi.indata; + if (uio != NULL) + fgin->size = uio->uio_resid; + else if (ap->a_size != NULL) + fgin->size = 0; + else { + err = EPERM; + goto out; + } + fgin++; + memcpy(fgin, ap->a_name, len + 1); + + err = fdisp_wait_answ(&fdi); + if (err != 0) + goto out; + + if (uio != NULL) { + if (ap->a_size != NULL) + *ap->a_size = fdi.iosize; + if (fdi.iosize > 0) + err = uiomove(fdi.answ, fdi.iosize, uio); + } else if (ap->a_size != NULL) { + /* When size is set to 0, it returns the size. */ + fgout = (struct fuse_getxattr_out *)fdi.answ; + *ap->a_size = fgout->size; + } +out: + fdisp_destroy(&fdi); + return (err); +} + +/* + struct vop_setextattr_args { + struct vop_generic_args a_gen; + struct vnode *a_vp; + int a_attrnamespace; + const char *a_name; + struct uio *a_uio; + struct ucred *a_cred; + struct thread *a_td; + }; +*/ +static int +fuse_vnop_setextattr(struct vop_setextattr_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + struct fuse_dispatcher fdi; + struct fuse_setxattr_in *fsin; + int err = 0; + size_t len; + char *cp; + + FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); + + /* Fuse only supports one attribute name space, so make it "user". */ + if (ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) + return (ENOATTR); + if (fuse_isdeadfs(vp) != 0) + return (ENXIO); + len = strlen(ap->a_name); + fdisp_init(&fdi, sizeof(struct fuse_setxattr_in) + len + 1 + + uio->uio_resid); + fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, curthread, ap->a_cred); + fsin = (struct fuse_setxattr_in *)fdi.indata; + fsin->size = uio->uio_resid; + fsin->flags = 0; + fsin++; + cp = (char *)fsin; + memcpy(cp, ap->a_name, len + 1); + cp += len + 1; + err = uiomove(cp, uio->uio_resid, uio); + if (err != 0) + goto out; + + err = fdisp_wait_answ(&fdi); +out: + fdisp_destroy(&fdi); + return (err); +} + +/* + struct vop_listextattr_args { + struct vop_generic_args a_gen; + struct vnode *a_vp; + int a_attrnamespace; + struct uio *a_uio; + size_t *a_size; + struct ucred *a_cred; + struct thread *a_td; + }; +*/ +static int +fuse_vnop_listextattr(struct vop_listextattr_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + struct fuse_dispatcher fdi; + struct fuse_getxattr_in *fgin; + struct fuse_getxattr_out *fgout; + int err = 0; + + FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); + + /* Fuse only supports one attribute name space, so make it "user". */ + if (ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) + return (ENOATTR); + if (fuse_isdeadfs(vp) != 0) + return (ENXIO); + fdisp_init(&fdi, sizeof(struct fuse_getxattr_in)); + fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, curthread, ap->a_cred); + fgin = (struct fuse_getxattr_in *)fdi.indata; + if (uio != NULL) + fgin->size = uio->uio_resid; + else if (ap->a_size != NULL) + fgin->size = 0; + else { + err = EPERM; + goto out; + } + + err = fdisp_wait_answ(&fdi); + if (err != 0) + goto out; + + if (uio != NULL) { + if (ap->a_size != NULL) + *ap->a_size = fdi.iosize; + if (fdi.iosize > 0) + err = uiomove(fdi.answ, fdi.iosize, uio); + } else if (ap->a_size != NULL) { + /* When size is set to 0, it returns the size. */ + fgout = (struct fuse_getxattr_out *)fdi.answ; + *ap->a_size = fgout->size; + } +out: + fdisp_destroy(&fdi); + return (err); +} + +/* + struct vop_deleteextattr_args { + struct vop_generic_args a_gen; + struct vnode *a_vp; + int a_attrnamespace; + const char *a_name; + struct ucred *a_cred; + struct thread *a_td; + }; +*/ +static int +fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct fuse_dispatcher fdi; + int err = 0; + size_t len; + char *cp; + + FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); + + /* Fuse only supports one attribute name space, so make it "user". */ + if (ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) + return (ENOATTR); + if (fuse_isdeadfs(vp) != 0) + return (ENXIO); + len = strlen(ap->a_name); + fdisp_init(&fdi, len + 1); + fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, curthread, ap->a_cred); + cp = (char *)fdi.indata; + memcpy(cp, ap->a_name, len + 1); + + err = fdisp_wait_answ(&fdi); + + fdisp_destroy(&fdi); + return (err); +} + +/* + struct vop_advlock_args { + struct vop_generic_args a_gen; + struct vnode *a_vp; + void *a_id; + int a_op; + struct flock *a_fl; + int a_flags; + }; + */ +static int +fuse_vnop_advlock(struct vop_advlock_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct flock *flp = ap->a_fl; + struct ucred *cred; + struct proc *p = (struct proc *)ap->a_id; + struct fuse_dispatcher fdi; + struct fuse_lk_in *fli; + struct fuse_lk_out *flo; + struct fuse_filehandle *fufh = NULL; + enum fuse_opcode op; + fufh_type_t fufhtype = FUFH_INVALID; + int err = 0, ret; + off_t start, end, oadd; + pid_t pid; + static int non_event; + + FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); + + if (fuse_isdeadfs(vp) != 0) + return (ENXIO); + if ((ap->a_flags & (F_POSIX | F_FLOCK | F_REMOTE)) == 0) + return (EPERM); + if (vp->v_type != VREG) + return (EINVAL); + + /* Set up the parameters for the lock operation. */ + if (ap->a_op == F_UNLCK) { + op = FUSE_SETLK; + fufhtype = FUFH_RDONLY; + } else if (ap->a_op == F_SETLK) { + op = FUSE_SETLK; + if (flp->l_type == F_WRLCK) + fufhtype = FUFH_WRONLY; + else + fufhtype = FUFH_RDONLY; + } else if (ap->a_op == F_GETLK) { + op = FUSE_GETLK; + fufhtype = FUFH_RDONLY; + } + err = fuse_filehandle_getrw(vp, fufhtype, &fufh); + if (err != 0) + return (ENXIO); + + /* + * The start and end are defined as u64, but the GlusterFS code + * uses OFFSET_MAX, which is defined as the maximum signed + * 64bit in GlusterFS, so I've used OFF_MAX, which is the same. + */ + if ((ap->a_flags & F_FLOCK) != 0) { + cred = curthread->td_ucred; + start = 0; + end = OFF_MAX; + pid = 0; + } else { + /* POSIX lock. */ + cred = p->p_ucred; + start = flp->l_start; + if (start < 0) + return (EINVAL); + if (flp->l_len < 0) { + if (start == 0) + return (EINVAL); + end = start - 1; + start += flp->l_len; + if (start < 0) + return (EINVAL); + } else if (flp->l_len == 0) { + end = OFF_MAX; + } else { + oadd = flp->l_len - 1; + if (oadd > OFF_MAX - start) + return (EOVERFLOW); + end = start + oadd; + } + pid = flp->l_pid; + } + + /* + * I am not sure if serializing the lock operations is required, + * but I chose to do so using the exclusive vnode lock. + */ + err = vn_lock(vp, LK_EXCLUSIVE); + if (err != 0) + return (EBADF); + + /* + * Although fuse has a FUSE_SETLKW option, I didn't like the idea + * of waiting indefinitely for an upcall to reply. I also don't + * know how to interrupt a fuse operation in progress when a + * termination signal is posted to the process/thread. + * So, instead of using FUSE_SETLKW, the code uses FUSE_SETLK and then + * polls while it returns EAGAIN, after a one tick sleep. + * If a termination signal is posted to the process/thread, tsleep() + * should return EINTR and then this function will return without + * doing the lock. + */ + do { + /* Now, do the lock operation. */ + fdisp_init(&fdi, sizeof(struct fuse_lk_in)); + fdisp_make_vp(&fdi, op, vp, curthread, cred); + fli = (struct fuse_lk_in *)fdi.indata; + fli->lk.start = start; + fli->lk.end = end; + fli->lk.pid = pid; + if ((ap->a_flags & F_REMOTE) != 0) { + /* Make the owner a combination of l_pid and l_sysid. */ + fli->owner = (uint64_t)flp->l_sysid; + fli->owner = (fli->owner << 32) | (flp->l_pid & + 0xffffffff); + } else + fli->owner = (uint64_t)ap->a_id; + fli->lk.type = flp->l_type; + fli->fh = fufh->fh_id; + + err = fdisp_wait_answ(&fdi); + + if (err == 0 && op == FUSE_GETLK && (ap->a_flags & F_POSIX) + != 0) { + flo = (struct fuse_lk_out *)fdi.answ; + flp->l_type = flo->lk.type; + if (flp->l_type != F_UNLCK) { + flp->l_start = flo->lk.start; + /* lk.end is u64, but OFF_MAX is INT64_MAX. */ + if (flo->lk.end > OFF_MAX) + flo->lk.end = OFF_MAX; + if (flo->lk.end == OFF_MAX) + flp->l_len = 0; + else + flp->l_len = flo->lk.end - flo->lk.start + + 1; + flp->l_pid = flo->lk.pid; + } + } + fdisp_destroy(&fdi); + /* Sleep for a little while, if we are going to loop. */ + if (err == EAGAIN && (ap->a_flags & F_WAIT) != 0 && + ap->a_op == F_SETLK) { + VOP_UNLOCK(vp, 0); + ret = tsleep(&non_event, PZERO | PCATCH, "fuslck", 1); + if (ret == EINTR) + return (ret); + ret = vn_lock(vp, LK_EXCLUSIVE); + if (ret != 0) + return (EBADF); + } + } while (err == EAGAIN && (ap->a_flags & F_WAIT) != 0 && + ap->a_op == F_SETLK); + + VOP_UNLOCK(vp, 0); + if (err == EINVAL || err == EBADF || err == EINTR || err == EAGAIN) + return (err); + else if (err != 0) + return (EACCES); + return (0); +} + +/* + struct vop_vptofh_args { + struct vop_generic_args a_gen; + struct vnode *a_vp; + struct fid *a_fhp; + }; +*/ +static int +fuse_vnop_vptofh(struct vop_vptofh_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct fid *fid; + struct fuse_fid_data ffid; + struct fuse_vnode_data *fvdat; + + if (fuse_isdeadfs(vp)) + return (ENXIO); + + CTASSERT(MAXFIDSZ >= sizeof(ffid)); + fid = (struct fid *)ap->a_fhp; + fvdat = VTOFUD(vp); + fid->fid_len = offsetof(struct fid, fid_data) + sizeof(ffid); + fid->fid_data0 = (u_short)vp->v_type; + ffid.ffid_nid = fvdat->nid; + ffid.ffid_parent_nid = fvdat->parent_nid; + memcpy(fid->fid_data, &ffid, sizeof(ffid)); + return (0); +} +